src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21     #include "wx/utils.h"
  22     #include "wx/hashmap.h"
  23 #endif
  24
  25 #include "wx/strconv.h"
  26
  27 #if wxUSE_WCHAR_T
  28
  29 #ifdef __WINDOWS__
  30     #include "wx/msw/private.h"
  31     #include "wx/msw/missing.h"
  32 #endif
  33
  34 #ifndef __WXWINCE__
  35 #include <errno.h>
  36 #endif
  37
  38 #include <ctype.h>
  39 #include <string.h>
  40 #include <stdlib.h>
  41
  42 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  43     #define wxHAVE_WIN32_MB2WC
  44 #endif
  45
  46 #ifdef __SALFORDC__
  47     #include <clib.h>
  48 #endif
  49
  50 #ifdef HAVE_ICONV
  51     #include <iconv.h>
  52     #include "wx/thread.h"
  53 #endif
  54
  55 #include "wx/encconv.h"
  56 #include "wx/fontmap.h"
  57
  58 #ifdef __WXMAC__
  59 #ifndef __DARWIN__
  60 #include <ATSUnicode.h>
  61 #include <TextCommon.h>
  62 #include <TextEncodingConverter.h>
  63 #endif
  64
  65 // includes Mac headers
  66 #include "wx/mac/private.h"
  67 #endif
  68
  69
  70 #define TRACE_STRCONV _T("strconv")
  71
  72 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  73 // be 4 bytes
  74 #if SIZEOF_WCHAR_T == 2
  75     #define WC_UTF16
  76 #endif
  77
  78
  79 // ============================================================================
  80 // implementation
  81 // ============================================================================
  82
  83 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  84 static bool NotAllNULs(const char *p, size_t n)
  85 {
  86     while ( n && *p++ == '\0' )
  87         n--;
  88
  89     return n != 0;
  90 }
  91
  92 // ----------------------------------------------------------------------------
  93 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  94 // ----------------------------------------------------------------------------
  95
  96 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  97 {
  98     if (input <= 0xffff)
  99     {
 100         if (output)
 101             *output = (wxUint16) input;
 102
 103         return 1;
 104     }
 105     else if (input >= 0x110000)
 106     {
 107         return wxCONV_FAILED;
 108     }
 109     else
 110     {
 111         if (output)
 112         {
 113             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 114             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 115         }
 116
 117         return 2;
 118     }
 119 }
 120
 121 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 122 {
 123     if ((*input < 0xd800) || (*input > 0xdfff))
 124     {
 125         output = *input;
 126         return 1;
 127     }
 128     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 129     {
 130         output = *input;
 131         return wxCONV_FAILED;
 132     }
 133     else
 134     {
 135         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 136         return 2;
 137     }
 138 }
 139
 140 #ifdef WC_UTF16
 141     typedef wchar_t wxDecodeSurrogate_t;
 142 #else // !WC_UTF16
 143     typedef wxUint16 wxDecodeSurrogate_t;
 144 #endif // WC_UTF16/!WC_UTF16
 145
 146 // returns the next UTF-32 character from the wchar_t buffer and advances the
 147 // pointer to the character after this one
 148 //
 149 // if an invalid character is found, *pSrc is set to NULL, the caller must
 150 // check for this
 151 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 152 {
 153     wxUint32 out;
 154     const size_t
 155         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 156     if ( n == wxCONV_FAILED )
 157         *pSrc = NULL;
 158     else
 159         *pSrc += n;
 160
 161     return out;
 162 }
 163
 164 // ----------------------------------------------------------------------------
 165 // wxMBConv
 166 // ----------------------------------------------------------------------------
 167
 168 size_t
 169 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 170                   const char *src, size_t srcLen) const
 171 {
 172     // although new conversion classes are supposed to implement this function
 173     // directly, the existins ones only implement the old MB2WC() and so, to
 174     // avoid to have to rewrite all conversion classes at once, we provide a
 175     // default (but not efficient) implementation of this one in terms of the
 176     // old function by copying the input to ensure that it's NUL-terminated and
 177     // then using MB2WC() to convert it
 178
 179     // the number of chars [which would be] written to dst [if it were not NULL]
 180     size_t dstWritten = 0;
 181
 182     // the number of NULs terminating this string
 183     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 184
 185     // if we were not given the input size we just have to assume that the
 186     // string is properly terminated as we have no way of knowing how long it
 187     // is anyhow, but if we do have the size check whether there are enough
 188     // NULs at the end
 189     wxCharBuffer bufTmp;
 190     const char *srcEnd;
 191     if ( srcLen != wxNO_LEN )
 192     {
 193         // we need to know how to find the end of this string
 194         nulLen = GetMBNulLen();
 195         if ( nulLen == wxCONV_FAILED )
 196             return wxCONV_FAILED;
 197
 198         // if there are enough NULs we can avoid the copy
 199         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 200         {
 201             // make a copy in order to properly NUL-terminate the string
 202             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 203             char * const p = bufTmp.data();
 204             memcpy(p, src, srcLen);
 205             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 206                 *s = '\0';
 207
 208             src = bufTmp;
 209         }
 210
 211         srcEnd = src + srcLen;
 212     }
 213     else // quit after the first loop iteration
 214     {
 215         srcEnd = NULL;
 216     }
 217
 218     for ( ;; )
 219     {
 220         // try to convert the current chunk
 221         size_t lenChunk = MB2WC(NULL, src, 0);
 222         if ( lenChunk == wxCONV_FAILED )
 223             return wxCONV_FAILED;
 224
 225         lenChunk++; // for the L'\0' at the end of this chunk
 226
 227         dstWritten += lenChunk;
 228
 229         if ( lenChunk == 1 )
 230         {
 231             // nothing left in the input string, conversion succeeded
 232             break;
 233         }
 234
 235         if ( dst )
 236         {
 237             if ( dstWritten > dstLen )
 238                 return wxCONV_FAILED;
 239
 240             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 241                 return wxCONV_FAILED;
 242
 243             dst += lenChunk;
 244         }
 245
 246         if ( !srcEnd )
 247         {
 248             // we convert just one chunk in this case as this is the entire
 249             // string anyhow
 250             break;
 251         }
 252
 253         // advance the input pointer past the end of this chunk
 254         while ( NotAllNULs(src, nulLen) )
 255         {
 256             // notice that we must skip over multiple bytes here as we suppose
 257             // that if NUL takes 2 or 4 bytes, then all the other characters do
 258             // too and so if advanced by a single byte we might erroneously
 259             // detect sequences of NUL bytes in the middle of the input
 260             src += nulLen;
 261         }
 262
 263         src += nulLen; // skipping over its terminator as well
 264
 265         // note that ">=" (and not just "==") is needed here as the terminator
 266         // we skipped just above could be inside or just after the buffer
 267         // delimited by inEnd
 268         if ( src >= srcEnd )
 269             break;
 270     }
 271
 272     return dstWritten;
 273 }
 274
 275 size_t
 276 wxMBConv::FromWChar(char *dst, size_t dstLen,
 277                     const wchar_t *src, size_t srcLen) const
 278 {
 279     // the number of chars [which would be] written to dst [if it were not NULL]
 280     size_t dstWritten = 0;
 281
 282     // make a copy of the input string unless it is already properly
 283     // NUL-terminated
 284     //
 285     // if we don't know its length we have no choice but to assume that it is,
 286     // indeed, properly terminated
 287     wxWCharBuffer bufTmp;
 288     if ( srcLen == wxNO_LEN )
 289     {
 290         srcLen = wxWcslen(src) + 1;
 291     }
 292     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 293     {
 294         // make a copy in order to properly NUL-terminate the string
 295         bufTmp = wxWCharBuffer(srcLen);
 296         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 297         src = bufTmp;
 298     }
 299
 300     const size_t lenNul = GetMBNulLen();
 301     for ( const wchar_t * const srcEnd = src + srcLen;
 302           src < srcEnd;
 303           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 304     {
 305         // try to convert the current chunk
 306         size_t lenChunk = WC2MB(NULL, src, 0);
 307
 308         if ( lenChunk == wxCONV_FAILED )
 309             return wxCONV_FAILED;
 310
 311         lenChunk += lenNul;
 312         dstWritten += lenChunk;
 313
 314         if ( dst )
 315         {
 316             if ( dstWritten > dstLen )
 317                 return wxCONV_FAILED;
 318
 319             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 320                 return wxCONV_FAILED;
 321
 322             dst += lenChunk;
 323         }
 324     }
 325
 326     return dstWritten;
 327 }
 328
 329 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 330 {
 331     size_t rc = ToWChar(outBuff, outLen, inBuff);
 332     if ( rc != wxCONV_FAILED )
 333     {
 334         // ToWChar() returns the buffer length, i.e. including the trailing
 335         // NUL, while this method doesn't take it into account
 336         rc--;
 337     }
 338
 339     return rc;
 340 }
 341
 342 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 343 {
 344     size_t rc = FromWChar(outBuff, outLen, inBuff);
 345     if ( rc != wxCONV_FAILED )
 346     {
 347         rc -= GetMBNulLen();
 348     }
 349
 350     return rc;
 351 }
 352
 353 wxMBConv::~wxMBConv()
 354 {
 355     // nothing to do here (necessary for Darwin linking probably)
 356 }
 357
 358 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 359 {
 360     if ( psz )
 361     {
 362         // calculate the length of the buffer needed first
 363         const size_t nLen = MB2WC(NULL, psz, 0);
 364         if ( nLen != wxCONV_FAILED )
 365         {
 366             // now do the actual conversion
 367             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 368
 369             // +1 for the trailing NULL
 370             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 371                 return buf;
 372         }
 373     }
 374
 375     return wxWCharBuffer();
 376 }
 377
 378 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 379 {
 380     if ( pwz )
 381     {
 382         const size_t nLen = WC2MB(NULL, pwz, 0);
 383         if ( nLen != wxCONV_FAILED )
 384         {
 385             // extra space for trailing NUL(s)
 386             static const size_t extraLen = GetMaxMBNulLen();
 387
 388             wxCharBuffer buf(nLen + extraLen - 1);
 389             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 390                 return buf;
 391         }
 392     }
 393
 394     return wxCharBuffer();
 395 }
 396
 397 const wxWCharBuffer
 398 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 399 {
 400     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 401     if ( dstLen != wxCONV_FAILED )
 402     {
 403         wxWCharBuffer wbuf(dstLen - 1);
 404         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 405         {
 406             if ( outLen )
 407             {
 408                 *outLen = dstLen;
 409                 if ( wbuf[dstLen - 1] == L'\0' )
 410                     (*outLen)--;
 411             }
 412
 413             return wbuf;
 414         }
 415     }
 416
 417     if ( outLen )
 418         *outLen = 0;
 419
 420     return wxWCharBuffer();
 421 }
 422
 423 const wxCharBuffer
 424 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 425 {
 426     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 427     if ( dstLen != wxCONV_FAILED )
 428     {
 429         // special case of empty input: can't allocate 0 size buffer below as
 430         // wxCharBuffer insists on NUL-terminating it
 431         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 432         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 433         {
 434             if ( outLen )
 435             {
 436                 *outLen = dstLen;
 437
 438                 const size_t nulLen = GetMBNulLen();
 439                 if ( dstLen >= nulLen &&
 440                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 441                 {
 442                     // in this case the output is NUL-terminated and we're not
 443                     // supposed to count NUL
 444                     *outLen -= nulLen;
 445                 }
 446             }
 447
 448             return buf;
 449         }
 450     }
 451
 452     if ( outLen )
 453         *outLen = 0;
 454
 455     return wxCharBuffer();
 456 }
 457
 458 // ----------------------------------------------------------------------------
 459 // wxMBConvLibc
 460 // ----------------------------------------------------------------------------
 461
 462 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 463 {
 464     return wxMB2WC(buf, psz, n);
 465 }
 466
 467 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 468 {
 469     return wxWC2MB(buf, psz, n);
 470 }
 471
 472 // ----------------------------------------------------------------------------
 473 // wxConvBrokenFileNames
 474 // ----------------------------------------------------------------------------
 475
 476 #ifdef __UNIX__
 477
 478 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 479 {
 480     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 481                   || wxStricmp(charset, _T("UTF8")) == 0  )
 482         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 483     else
 484         m_conv = new wxCSConv(charset);
 485 }
 486
 487 #endif // __UNIX__
 488
 489 // ----------------------------------------------------------------------------
 490 // UTF-7
 491 // ----------------------------------------------------------------------------
 492
 493 // Implementation (C) 2004 Fredrik Roubert
 494
 495 //
 496 // BASE64 decoding table
 497 //
 498 static const unsigned char utf7unb64[] =
 499 {
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 502     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 504     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 506     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 507     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 509     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 510     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 511     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 513     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 514     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 515     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 529     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 530     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 531     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 532 };
 533
 534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 535 {
 536     size_t len = 0;
 537
 538     while ( *psz && (!buf || (len < n)) )
 539     {
 540         unsigned char cc = *psz++;
 541         if (cc != '+')
 542         {
 543             // plain ASCII char
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547         }
 548         else if (*psz == '-')
 549         {
 550             // encoded plus sign
 551             if (buf)
 552                 *buf++ = cc;
 553             len++;
 554             psz++;
 555         }
 556         else // start of BASE64 encoded string
 557         {
 558             bool lsb, ok;
 559             unsigned int d, l;
 560             for ( ok = lsb = false, d = 0, l = 0;
 561                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 562                   psz++ )
 563             {
 564                 d <<= 6;
 565                 d += cc;
 566                 for (l += 6; l >= 8; lsb = !lsb)
 567                 {
 568                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 569                     if (lsb)
 570                     {
 571                         if (buf)
 572                             *buf++ |= c;
 573                         len ++;
 574                     }
 575                     else
 576                     {
 577                         if (buf)
 578                             *buf = (wchar_t)(c << 8);
 579                     }
 580
 581                     ok = true;
 582                 }
 583             }
 584
 585             if ( !ok )
 586             {
 587                 // in valid UTF7 we should have valid characters after '+'
 588                 return wxCONV_FAILED;
 589             }
 590
 591             if (*psz == '-')
 592                 psz++;
 593         }
 594     }
 595
 596     if ( buf && (len < n) )
 597         *buf = '\0';
 598
 599     return len;
 600 }
 601
 602 //
 603 // BASE64 encoding table
 604 //
 605 static const unsigned char utf7enb64[] =
 606 {
 607     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 608     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 609     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 610     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 611     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 612     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 613     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 614     '4', '5', '6', '7', '8', '9', '+', '/'
 615 };
 616
 617 //
 618 // UTF-7 encoding table
 619 //
 620 // 0 - Set D (directly encoded characters)
 621 // 1 - Set O (optional direct characters)
 622 // 2 - whitespace characters (optional)
 623 // 3 - special characters
 624 //
 625 static const unsigned char utf7encode[128] =
 626 {
 627     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 628     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 629     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 630     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 631     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 632     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 633     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 634     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 635 };
 636
 637 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 638 {
 639     size_t len = 0;
 640
 641     while (*psz && ((!buf) || (len < n)))
 642     {
 643         wchar_t cc = *psz++;
 644         if (cc < 0x80 && utf7encode[cc] < 1)
 645         {
 646             // plain ASCII char
 647             if (buf)
 648                 *buf++ = (char)cc;
 649
 650             len++;
 651         }
 652 #ifndef WC_UTF16
 653         else if (((wxUint32)cc) > 0xffff)
 654         {
 655             // no surrogate pair generation (yet?)
 656             return wxCONV_FAILED;
 657         }
 658 #endif
 659         else
 660         {
 661             if (buf)
 662                 *buf++ = '+';
 663
 664             len++;
 665             if (cc != '+')
 666             {
 667                 // BASE64 encode string
 668                 unsigned int lsb, d, l;
 669                 for (d = 0, l = 0; /*nothing*/; psz++)
 670                 {
 671                     for (lsb = 0; lsb < 2; lsb ++)
 672                     {
 673                         d <<= 8;
 674                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 675
 676                         for (l += 8; l >= 6; )
 677                         {
 678                             l -= 6;
 679                             if (buf)
 680                                 *buf++ = utf7enb64[(d >> l) % 64];
 681                             len++;
 682                         }
 683                     }
 684
 685                     cc = *psz;
 686                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 687                         break;
 688                 }
 689
 690                 if (l != 0)
 691                 {
 692                     if (buf)
 693                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 694
 695                     len++;
 696                 }
 697             }
 698
 699             if (buf)
 700                 *buf++ = '-';
 701             len++;
 702         }
 703     }
 704
 705     if (buf && (len < n))
 706         *buf = 0;
 707
 708     return len;
 709 }
 710
 711 // ----------------------------------------------------------------------------
 712 // UTF-8
 713 // ----------------------------------------------------------------------------
 714
 715 static wxUint32 utf8_max[]=
 716     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 717
 718 // boundaries of the private use area we use to (temporarily) remap invalid
 719 // characters invalid in a UTF-8 encoded string
 720 const wxUint32 wxUnicodePUA = 0x100000;
 721 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 722
 723 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 724 {
 725     size_t len = 0;
 726
 727     while (*psz && ((!buf) || (len < n)))
 728     {
 729         const char *opsz = psz;
 730         bool invalid = false;
 731         unsigned char cc = *psz++, fc = cc;
 732         unsigned cnt;
 733         for (cnt = 0; fc & 0x80; cnt++)
 734             fc <<= 1;
 735
 736         if (!cnt)
 737         {
 738             // plain ASCII char
 739             if (buf)
 740                 *buf++ = cc;
 741             len++;
 742
 743             // escape the escape character for octal escapes
 744             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 745                     && cc == '\\' && (!buf || len < n))
 746             {
 747                 if (buf)
 748                     *buf++ = cc;
 749                 len++;
 750             }
 751         }
 752         else
 753         {
 754             cnt--;
 755             if (!cnt)
 756             {
 757                 // invalid UTF-8 sequence
 758                 invalid = true;
 759             }
 760             else
 761             {
 762                 unsigned ocnt = cnt - 1;
 763                 wxUint32 res = cc & (0x3f >> cnt);
 764                 while (cnt--)
 765                 {
 766                     cc = *psz;
 767                     if ((cc & 0xC0) != 0x80)
 768                     {
 769                         // invalid UTF-8 sequence
 770                         invalid = true;
 771                         break;
 772                     }
 773
 774                     psz++;
 775                     res = (res << 6) | (cc & 0x3f);
 776                 }
 777
 778                 if (invalid || res <= utf8_max[ocnt])
 779                 {
 780                     // illegal UTF-8 encoding
 781                     invalid = true;
 782                 }
 783                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 784                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 785                 {
 786                     // if one of our PUA characters turns up externally
 787                     // it must also be treated as an illegal sequence
 788                     // (a bit like you have to escape an escape character)
 789                     invalid = true;
 790                 }
 791                 else
 792                 {
 793 #ifdef WC_UTF16
 794                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 795                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 796                     if (pa == wxCONV_FAILED)
 797                     {
 798                         invalid = true;
 799                     }
 800                     else
 801                     {
 802                         if (buf)
 803                             buf += pa;
 804                         len += pa;
 805                     }
 806 #else // !WC_UTF16
 807                     if (buf)
 808                         *buf++ = (wchar_t)res;
 809                     len++;
 810 #endif // WC_UTF16/!WC_UTF16
 811                 }
 812             }
 813
 814             if (invalid)
 815             {
 816                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 817                 {
 818                     while (opsz < psz && (!buf || len < n))
 819                     {
 820 #ifdef WC_UTF16
 821                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 822                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 823                         wxASSERT(pa != wxCONV_FAILED);
 824                         if (buf)
 825                             buf += pa;
 826                         opsz++;
 827                         len += pa;
 828 #else
 829                         if (buf)
 830                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 831                         opsz++;
 832                         len++;
 833 #endif
 834                     }
 835                 }
 836                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 837                 {
 838                     while (opsz < psz && (!buf || len < n))
 839                     {
 840                         if ( buf && len + 3 < n )
 841                         {
 842                             unsigned char on = *opsz;
 843                             *buf++ = L'\\';
 844                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 845                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 846                             *buf++ = (wchar_t)( L'0' + on % 010 );
 847                         }
 848
 849                         opsz++;
 850                         len += 4;
 851                     }
 852                 }
 853                 else // MAP_INVALID_UTF8_NOT
 854                 {
 855                     return wxCONV_FAILED;
 856                 }
 857             }
 858         }
 859     }
 860
 861     if (buf && (len < n))
 862         *buf = 0;
 863
 864     return len;
 865 }
 866
 867 static inline bool isoctal(wchar_t wch)
 868 {
 869     return L'0' <= wch && wch <= L'7';
 870 }
 871
 872 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 873 {
 874     size_t len = 0;
 875
 876     while (*psz && ((!buf) || (len < n)))
 877     {
 878         wxUint32 cc;
 879
 880 #ifdef WC_UTF16
 881         // cast is ok for WC_UTF16
 882         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 883         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 884 #else
 885         cc = (*psz++) & 0x7fffffff;
 886 #endif
 887
 888         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 889                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 890         {
 891             if (buf)
 892                 *buf++ = (char)(cc - wxUnicodePUA);
 893             len++;
 894         }
 895         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 896                     && cc == L'\\' && psz[0] == L'\\' )
 897         {
 898             if (buf)
 899                 *buf++ = (char)cc;
 900             psz++;
 901             len++;
 902         }
 903         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 904                     cc == L'\\' &&
 905                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 906         {
 907             if (buf)
 908             {
 909                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 910                                  (psz[1] - L'0') * 010 +
 911                                  (psz[2] - L'0'));
 912             }
 913
 914             psz += 3;
 915             len++;
 916         }
 917         else
 918         {
 919             unsigned cnt;
 920             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 921             {
 922             }
 923
 924             if (!cnt)
 925             {
 926                 // plain ASCII char
 927                 if (buf)
 928                     *buf++ = (char) cc;
 929                 len++;
 930             }
 931             else
 932             {
 933                 len += cnt + 1;
 934                 if (buf)
 935                 {
 936                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 937                     while (cnt--)
 938                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 939                 }
 940             }
 941         }
 942     }
 943
 944     if (buf && (len < n))
 945         *buf = 0;
 946
 947     return len;
 948 }
 949
 950 // ============================================================================
 951 // UTF-16
 952 // ============================================================================
 953
 954 #ifdef WORDS_BIGENDIAN
 955     #define wxMBConvUTF16straight wxMBConvUTF16BE
 956     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 957 #else
 958     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 959     #define wxMBConvUTF16straight wxMBConvUTF16LE
 960 #endif
 961
 962 /* static */
 963 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 964 {
 965     if ( srcLen == wxNO_LEN )
 966     {
 967         // count the number of bytes in input, including the trailing NULs
 968         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 969         for ( srcLen = 1; *inBuff++; srcLen++ )
 970             ;
 971
 972         srcLen *= BYTES_PER_CHAR;
 973     }
 974     else // we already have the length
 975     {
 976         // we can only convert an entire number of UTF-16 characters
 977         if ( srcLen % BYTES_PER_CHAR )
 978             return wxCONV_FAILED;
 979     }
 980
 981     return srcLen;
 982 }
 983
 984 // case when in-memory representation is UTF-16 too
 985 #ifdef WC_UTF16
 986
 987 // ----------------------------------------------------------------------------
 988 // conversions without endianness change
 989 // ----------------------------------------------------------------------------
 990
 991 size_t
 992 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 993                                const char *src, size_t srcLen) const
 994 {
 995     // set up the scene for using memcpy() (which is presumably more efficient
 996     // than copying the bytes one by one)
 997     srcLen = GetLength(src, srcLen);
 998     if ( srcLen == wxNO_LEN )
 999         return wxCONV_FAILED;
1000
1001     const size_t inLen = srcLen / BYTES_PER_CHAR;
1002     if ( dst )
1003     {
1004         if ( dstLen < inLen )
1005             return wxCONV_FAILED;
1006
1007         memcpy(dst, src, srcLen);
1008     }
1009
1010     return inLen;
1011 }
1012
1013 size_t
1014 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1015                                  const wchar_t *src, size_t srcLen) const
1016 {
1017     if ( srcLen == wxNO_LEN )
1018         srcLen = wxWcslen(src) + 1;
1019
1020     srcLen *= BYTES_PER_CHAR;
1021
1022     if ( dst )
1023     {
1024         if ( dstLen < srcLen )
1025             return wxCONV_FAILED;
1026
1027         memcpy(dst, src, srcLen);
1028     }
1029
1030     return srcLen;
1031 }
1032
1033 // ----------------------------------------------------------------------------
1034 // endian-reversing conversions
1035 // ----------------------------------------------------------------------------
1036
1037 size_t
1038 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1039                            const char *src, size_t srcLen) const
1040 {
1041     srcLen = GetLength(src, srcLen);
1042     if ( srcLen == wxNO_LEN )
1043         return wxCONV_FAILED;
1044
1045     srcLen /= BYTES_PER_CHAR;
1046
1047     if ( dst )
1048     {
1049         if ( dstLen < srcLen )
1050             return wxCONV_FAILED;
1051
1052         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1053         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1054         {
1055             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1056         }
1057     }
1058
1059     return srcLen;
1060 }
1061
1062 size_t
1063 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1064                              const wchar_t *src, size_t srcLen) const
1065 {
1066     if ( srcLen == wxNO_LEN )
1067         srcLen = wxWcslen(src) + 1;
1068
1069     srcLen *= BYTES_PER_CHAR;
1070
1071     if ( dst )
1072     {
1073         if ( dstLen < srcLen )
1074             return wxCONV_FAILED;
1075
1076         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1077         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1078         {
1079             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1080         }
1081     }
1082
1083     return srcLen;
1084 }
1085
1086 #else // !WC_UTF16: wchar_t is UTF-32
1087
1088 // ----------------------------------------------------------------------------
1089 // conversions without endianness change
1090 // ----------------------------------------------------------------------------
1091
1092 size_t
1093 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1094                                const char *src, size_t srcLen) const
1095 {
1096     srcLen = GetLength(src, srcLen);
1097     if ( srcLen == wxNO_LEN )
1098         return wxCONV_FAILED;
1099
1100     const size_t inLen = srcLen / BYTES_PER_CHAR;
1101     if ( !dst )
1102     {
1103         // optimization: return maximal space which could be needed for this
1104         // string even if the real size could be smaller if the buffer contains
1105         // any surrogates
1106         return inLen;
1107     }
1108
1109     size_t outLen = 0;
1110     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1111     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1112     {
1113         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1114         if ( !inBuff )
1115             return wxCONV_FAILED;
1116
1117         if ( ++outLen > dstLen )
1118             return wxCONV_FAILED;
1119
1120         *dst++ = ch;
1121     }
1122
1123
1124     return outLen;
1125 }
1126
1127 size_t
1128 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1129                                  const wchar_t *src, size_t srcLen) const
1130 {
1131     if ( srcLen == wxNO_LEN )
1132         srcLen = wxWcslen(src) + 1;
1133
1134     size_t outLen = 0;
1135     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1136     for ( size_t n = 0; n < srcLen; n++ )
1137     {
1138         wxUint16 cc[2];
1139         const size_t numChars = encode_utf16(*src++, cc);
1140         if ( numChars == wxCONV_FAILED )
1141             return wxCONV_FAILED;
1142
1143         outLen += numChars * BYTES_PER_CHAR;
1144         if ( outBuff )
1145         {
1146             if ( outLen > dstLen )
1147                 return wxCONV_FAILED;
1148
1149             *outBuff++ = cc[0];
1150             if ( numChars == 2 )
1151             {
1152                 // second character of a surrogate
1153                 *outBuff++ = cc[1];
1154             }
1155         }
1156     }
1157
1158     return outLen;
1159 }
1160
1161 // ----------------------------------------------------------------------------
1162 // endian-reversing conversions
1163 // ----------------------------------------------------------------------------
1164
1165 size_t
1166 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1167                            const char *src, size_t srcLen) const
1168 {
1169     srcLen = GetLength(src, srcLen);
1170     if ( srcLen == wxNO_LEN )
1171         return wxCONV_FAILED;
1172
1173     const size_t inLen = srcLen / BYTES_PER_CHAR;
1174     if ( !dst )
1175     {
1176         // optimization: return maximal space which could be needed for this
1177         // string even if the real size could be smaller if the buffer contains
1178         // any surrogates
1179         return inLen;
1180     }
1181
1182     size_t outLen = 0;
1183     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1184     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1185     {
1186         wxUint32 ch;
1187         wxUint16 tmp[2];
1188
1189         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1190         inBuff++;
1191         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1192
1193         const size_t numChars = decode_utf16(tmp, ch);
1194         if ( numChars == wxCONV_FAILED )
1195             return wxCONV_FAILED;
1196
1197         if ( numChars == 2 )
1198             inBuff++;
1199
1200         if ( ++outLen > dstLen )
1201             return wxCONV_FAILED;
1202
1203         *dst++ = ch;
1204     }
1205
1206
1207     return outLen;
1208 }
1209
1210 size_t
1211 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1212                              const wchar_t *src, size_t srcLen) const
1213 {
1214     if ( srcLen == wxNO_LEN )
1215         srcLen = wxWcslen(src) + 1;
1216
1217     size_t outLen = 0;
1218     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1219     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1220     {
1221         wxUint16 cc[2];
1222         const size_t numChars = encode_utf16(*src, cc);
1223         if ( numChars == wxCONV_FAILED )
1224             return wxCONV_FAILED;
1225
1226         outLen += numChars * BYTES_PER_CHAR;
1227         if ( outBuff )
1228         {
1229             if ( outLen > dstLen )
1230                 return wxCONV_FAILED;
1231
1232             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1233             if ( numChars == 2 )
1234             {
1235                 // second character of a surrogate
1236                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1237             }
1238         }
1239     }
1240
1241     return outLen;
1242 }
1243
1244 #endif // WC_UTF16/!WC_UTF16
1245
1246
1247 // ============================================================================
1248 // UTF-32
1249 // ============================================================================
1250
// Select the concrete classes behind the "straight" (no endianness change)
// and "swap" (endian-reversing) names used below: if WORDS_BIGENDIAN is
// defined the straight converter is the BE one, otherwise it is the LE one.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global instances of the little and big endian UTF-32 converters
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1262
1263 /* static */
1264 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1265 {
1266     if ( srcLen == wxNO_LEN )
1267     {
1268         // count the number of bytes in input, including the trailing NULs
1269         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1270         for ( srcLen = 1; *inBuff++; srcLen++ )
1271             ;
1272
1273         srcLen *= BYTES_PER_CHAR;
1274     }
1275     else // we already have the length
1276     {
1277         // we can only convert an entire number of UTF-32 characters
1278         if ( srcLen % BYTES_PER_CHAR )
1279             return wxCONV_FAILED;
1280     }
1281
1282     return srcLen;
1283 }
1284
1285 // case when in-memory representation is UTF-16
1286 #ifdef WC_UTF16
1287
1288 // ----------------------------------------------------------------------------
1289 // conversions without endianness change
1290 // ----------------------------------------------------------------------------
1291
1292 size_t
1293 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1294                                const char *src, size_t srcLen) const
1295 {
1296     srcLen = GetLength(src, srcLen);
1297     if ( srcLen == wxNO_LEN )
1298         return wxCONV_FAILED;
1299
1300     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1301     const size_t inLen = srcLen / BYTES_PER_CHAR;
1302     size_t outLen = 0;
1303     for ( size_t n = 0; n < inLen; n++ )
1304     {
1305         wxUint16 cc[2];
1306         const size_t numChars = encode_utf16(*inBuff++, cc);
1307         if ( numChars == wxCONV_FAILED )
1308             return wxCONV_FAILED;
1309
1310         outLen += numChars;
1311         if ( dst )
1312         {
1313             if ( outLen > dstLen )
1314                 return wxCONV_FAILED;
1315
1316             *dst++ = cc[0];
1317             if ( numChars == 2 )
1318             {
1319                 // second character of a surrogate
1320                 *dst++ = cc[1];
1321             }
1322         }
1323     }
1324
1325     return outLen;
1326 }
1327
1328 size_t
1329 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1330                                  const wchar_t *src, size_t srcLen) const
1331 {
1332     if ( srcLen == wxNO_LEN )
1333         srcLen = wxWcslen(src) + 1;
1334
1335     if ( !dst )
1336     {
1337         // optimization: return maximal space which could be needed for this
1338         // string instead of the exact amount which could be less if there are
1339         // any surrogates in the input
1340         //
1341         // we consider that surrogates are rare enough to make it worthwhile to
1342         // avoid running the loop below at the cost of slightly extra memory
1343         // consumption
1344         return srcLen * BYTES_PER_CHAR;
1345     }
1346
1347     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1348     size_t outLen = 0;
1349     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1350     {
1351         const wxUint32 ch = wxDecodeSurrogate(&src);
1352         if ( !src )
1353             return wxCONV_FAILED;
1354
1355         outLen += BYTES_PER_CHAR;
1356
1357         if ( outLen > dstLen )
1358             return wxCONV_FAILED;
1359
1360         *outBuff++ = ch;
1361     }
1362
1363     return outLen;
1364 }
1365
1366 // ----------------------------------------------------------------------------
1367 // endian-reversing conversions
1368 // ----------------------------------------------------------------------------
1369
1370 size_t
1371 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1372                            const char *src, size_t srcLen) const
1373 {
1374     srcLen = GetLength(src, srcLen);
1375     if ( srcLen == wxNO_LEN )
1376         return wxCONV_FAILED;
1377
1378     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1379     const size_t inLen = srcLen / BYTES_PER_CHAR;
1380     size_t outLen = 0;
1381     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1382     {
1383         wxUint16 cc[2];
1384         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1385         if ( numChars == wxCONV_FAILED )
1386             return wxCONV_FAILED;
1387
1388         outLen += numChars;
1389         if ( dst )
1390         {
1391             if ( outLen > dstLen )
1392                 return wxCONV_FAILED;
1393
1394             *dst++ = cc[0];
1395             if ( numChars == 2 )
1396             {
1397                 // second character of a surrogate
1398                 *dst++ = cc[1];
1399             }
1400         }
1401     }
1402
1403     return outLen;
1404 }
1405
1406 size_t
1407 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1408                              const wchar_t *src, size_t srcLen) const
1409 {
1410     if ( srcLen == wxNO_LEN )
1411         srcLen = wxWcslen(src) + 1;
1412
1413     if ( !dst )
1414     {
1415         // optimization: return maximal space which could be needed for this
1416         // string instead of the exact amount which could be less if there are
1417         // any surrogates in the input
1418         //
1419         // we consider that surrogates are rare enough to make it worthwhile to
1420         // avoid running the loop below at the cost of slightly extra memory
1421         // consumption
1422         return srcLen*BYTES_PER_CHAR;
1423     }
1424
1425     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1426     size_t outLen = 0;
1427     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1428     {
1429         const wxUint32 ch = wxDecodeSurrogate(&src);
1430         if ( !src )
1431             return wxCONV_FAILED;
1432
1433         outLen += BYTES_PER_CHAR;
1434
1435         if ( outLen > dstLen )
1436             return wxCONV_FAILED;
1437
1438         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1439     }
1440
1441     return outLen;
1442 }
1443
1444 #else // !WC_UTF16: wchar_t is UTF-32
1445
1446 // ----------------------------------------------------------------------------
1447 // conversions without endianness change
1448 // ----------------------------------------------------------------------------
1449
1450 size_t
1451 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1452                                const char *src, size_t srcLen) const
1453 {
1454     // use memcpy() as it should be much faster than hand-written loop
1455     srcLen = GetLength(src, srcLen);
1456     if ( srcLen == wxNO_LEN )
1457         return wxCONV_FAILED;
1458
1459     const size_t inLen = srcLen/BYTES_PER_CHAR;
1460     if ( dst )
1461     {
1462         if ( dstLen < inLen )
1463             return wxCONV_FAILED;
1464
1465         memcpy(dst, src, srcLen);
1466     }
1467
1468     return inLen;
1469 }
1470
1471 size_t
1472 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1473                                  const wchar_t *src, size_t srcLen) const
1474 {
1475     if ( srcLen == wxNO_LEN )
1476         srcLen = wxWcslen(src) + 1;
1477
1478     srcLen *= BYTES_PER_CHAR;
1479
1480     if ( dst )
1481     {
1482         if ( dstLen < srcLen )
1483             return wxCONV_FAILED;
1484
1485         memcpy(dst, src, srcLen);
1486     }
1487
1488     return srcLen;
1489 }
1490
1491 // ----------------------------------------------------------------------------
1492 // endian-reversing conversions
1493 // ----------------------------------------------------------------------------
1494
1495 size_t
1496 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1497                            const char *src, size_t srcLen) const
1498 {
1499     srcLen = GetLength(src, srcLen);
1500     if ( srcLen == wxNO_LEN )
1501         return wxCONV_FAILED;
1502
1503     srcLen /= BYTES_PER_CHAR;
1504
1505     if ( dst )
1506     {
1507         if ( dstLen < srcLen )
1508             return wxCONV_FAILED;
1509
1510         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1511         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1512         {
1513             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1514         }
1515     }
1516
1517     return srcLen;
1518 }
1519
1520 size_t
1521 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1522                              const wchar_t *src, size_t srcLen) const
1523 {
1524     if ( srcLen == wxNO_LEN )
1525         srcLen = wxWcslen(src) + 1;
1526
1527     srcLen *= BYTES_PER_CHAR;
1528
1529     if ( dst )
1530     {
1531         if ( dstLen < srcLen )
1532             return wxCONV_FAILED;
1533
1534         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1535         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1536         {
1537             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1538         }
1539     }
1540
1541     return srcLen;
1542 }
1543
1544 #endif // WC_UTF16/!WC_UTF16
1545
1546
1547 // ============================================================================
1548 // The classes doing conversion using the iconv_xxx() functions
1549 // ============================================================================
1550
1551 #ifdef HAVE_ICONV
1552
1553 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1554 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1555 //     (unless there's yet another bug in glibc) the only case when iconv()
1556 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1557 //     left in the input buffer -- when _real_ error occurs,
1558 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1559 //     iconv() failure.
1560 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): test whether an iconv() call returning cres
// and leaving bufLeft unconverted input bytes has really failed
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast applied to the address of the input pointer passed to iconv(); the
// constness of the target type is given by ICONV_CONST
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value against which the descriptors returned by iconv_open() are compared
// to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP is the byte swapping macro and WC_ENC the font encoding matching
// the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1582
1583 // ----------------------------------------------------------------------------
1584 // wxMBConv_iconv: encapsulates an iconv character set
1585 // ----------------------------------------------------------------------------
1586
1587 class wxMBConv_iconv : public wxMBConv
1588 {
1589 public:
1590     wxMBConv_iconv(const wxChar *name);
1591     virtual ~wxMBConv_iconv();
1592
1593     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1594     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1595
1596     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1597     virtual size_t GetMBNulLen() const;
1598
1599     virtual wxMBConv *Clone() const
1600     {
1601         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1602         p->m_minMBCharWidth = m_minMBCharWidth;
1603         return p;
1604     }
1605
1606     bool IsOk() const
1607         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1608
1609 protected:
1610     // the iconv handlers used to translate from multibyte
1611     // to wide char and in the other direction
1612     iconv_t m2w,
1613             w2m;
1614
1615 #if wxUSE_THREADS
1616     // guards access to m2w and w2m objects
1617     wxMutex m_iconvMutex;
1618 #endif
1619
1620 private:
1621     // the name (for iconv_open()) of a wide char charset -- if none is
1622     // available on this machine, it will remain NULL
1623     static wxString ms_wcCharsetName;
1624
1625     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1626     // different endian-ness than the native one
1627     static bool ms_wcNeedsSwap;
1628
1629
1630     // name of the encoding handled by this conversion
1631     wxString m_name;
1632
1633     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1634     // initially
1635     size_t m_minMBCharWidth;
1636 };
1637
1638 // make the constructor available for unit testing
1639 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1640 {
1641     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1642     if ( !result->IsOk() )
1643     {
1644         delete result;
1645         return 0;
1646     }
1647
1648     return result;
1649 }
1650
// definitions of the static members: the charset name is initially empty and
// is filled in by wxMBConv_iconv constructor once a suitable wchar_t charset
// is found, the swap flag is set by it at the same time if needed
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1653
1654 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1655               : m_name(name)
1656 {
1657     m_minMBCharWidth = 0;
1658
1659     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1660     // names for the charsets
1661     const wxCharBuffer cname(wxString(name).ToAscii());
1662
1663     // check for charset that represents wchar_t:
1664     if ( ms_wcCharsetName.empty() )
1665     {
1666         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1667
1668 #if wxUSE_FONTMAP
1669         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1670 #else // !wxUSE_FONTMAP
1671         static const wxChar *names[] =
1672         {
1673 #if SIZEOF_WCHAR_T == 4
1674             _T("UCS-4"),
1675 #elif SIZEOF_WCHAR_T = 2
1676             _T("UCS-2"),
1677 #endif
1678             NULL
1679         };
1680 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1681
1682         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1683         {
1684             const wxString nameCS(*names);
1685
1686             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1687             wxString nameXE(nameCS);
1688
1689 #ifdef WORDS_BIGENDIAN
1690                 nameXE += _T("BE");
1691 #else // little endian
1692                 nameXE += _T("LE");
1693 #endif
1694
1695             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1696                        nameXE.c_str());
1697
1698             m2w = iconv_open(nameXE.ToAscii(), cname);
1699             if ( m2w == ICONV_T_INVALID )
1700             {
1701                 // try charset w/o bytesex info (e.g. "UCS4")
1702                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1703                            nameCS.c_str());
1704                 m2w = iconv_open(nameCS.ToAscii(), cname);
1705
1706                 // and check for bytesex ourselves:
1707                 if ( m2w != ICONV_T_INVALID )
1708                 {
1709                     char    buf[2], *bufPtr;
1710                     wchar_t wbuf[2], *wbufPtr;
1711                     size_t  insz, outsz;
1712                     size_t  res;
1713
1714                     buf[0] = 'A';
1715                     buf[1] = 0;
1716                     wbuf[0] = 0;
1717                     insz = 2;
1718                     outsz = SIZEOF_WCHAR_T * 2;
1719                     wbufPtr = wbuf;
1720                     bufPtr = buf;
1721
1722                     res = iconv(
1723                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1724                         (char**)&wbufPtr, &outsz);
1725
1726                     if (ICONV_FAILED(res, insz))
1727                     {
1728                         wxLogLastError(wxT("iconv"));
1729                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1730                                    nameCS.c_str());
1731                     }
1732                     else // ok, can convert to this encoding, remember it
1733                     {
1734                         ms_wcCharsetName = nameCS;
1735                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1736                     }
1737                 }
1738             }
1739             else // use charset not requiring byte swapping
1740             {
1741                 ms_wcCharsetName = nameXE;
1742             }
1743         }
1744
1745         wxLogTrace(TRACE_STRCONV,
1746                    wxT("iconv wchar_t charset is \"%s\"%s"),
1747                    ms_wcCharsetName.empty() ? _T("<none>")
1748                                             : ms_wcCharsetName.c_str(),
1749                    ms_wcNeedsSwap ? _T(" (needs swap)")
1750                                   : _T(""));
1751     }
1752     else // we already have ms_wcCharsetName
1753     {
1754         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1755     }
1756
1757     if ( ms_wcCharsetName.empty() )
1758     {
1759         w2m = ICONV_T_INVALID;
1760     }
1761     else
1762     {
1763         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1764         if ( w2m == ICONV_T_INVALID )
1765         {
1766             wxLogTrace(TRACE_STRCONV,
1767                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1768                        ms_wcCharsetName.c_str(), cname.data());
1769         }
1770     }
1771 }
1772
1773 wxMBConv_iconv::~wxMBConv_iconv()
1774 {
1775     if ( m2w != ICONV_T_INVALID )
1776         iconv_close(m2w);
1777     if ( w2m != ICONV_T_INVALID )
1778         iconv_close(w2m);
1779 }
1780
1781 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1782 {
1783     // find the string length: notice that must be done differently for
1784     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1785     size_t inbuf;
1786     const size_t nulLen = GetMBNulLen();
1787     switch ( nulLen )
1788     {
1789         default:
1790             return wxCONV_FAILED;
1791
1792         case 1:
1793             inbuf = strlen(psz); // arguably more optimized than our version
1794             break;
1795
1796         case 2:
1797         case 4:
1798             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1799             // they also have to start at character boundary and not span two
1800             // adjacent characters
1801             const char *p;
1802             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1803                 ;
1804             inbuf = p - psz;
1805             break;
1806     }
1807
1808 #if wxUSE_THREADS
1809     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1810     //     Unfortunately there is a couple of global wxCSConv objects such as
1811     //     wxConvLocal that are used all over wx code, so we have to make sure
1812     //     the handle is used by at most one thread at the time. Otherwise
1813     //     only a few wx classes would be safe to use from non-main threads
1814     //     as MB<->WC conversion would fail "randomly".
1815     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1816 #endif // wxUSE_THREADS
1817
1818     size_t outbuf = n * SIZEOF_WCHAR_T;
1819     size_t res, cres;
1820     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1821     wchar_t *bufPtr = buf;
1822     const char *pszPtr = psz;
1823
1824     if (buf)
1825     {
1826         // have destination buffer, convert there
1827         cres = iconv(m2w,
1828                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1829                      (char**)&bufPtr, &outbuf);
1830         res = n - (outbuf / SIZEOF_WCHAR_T);
1831
1832         if (ms_wcNeedsSwap)
1833         {
1834             // convert to native endianness
1835             for ( unsigned i = 0; i < res; i++ )
1836                 buf[n] = WC_BSWAP(buf[i]);
1837         }
1838
1839         // NUL-terminate the string if there is any space left
1840         if (res < n)
1841             buf[res] = 0;
1842     }
1843     else
1844     {
1845         // no destination buffer... convert using temp buffer
1846         // to calculate destination buffer requirement
1847         wchar_t tbuf[8];
1848         res = 0;
1849
1850         do
1851         {
1852             bufPtr = tbuf;
1853             outbuf = 8 * SIZEOF_WCHAR_T;
1854
1855             cres = iconv(m2w,
1856                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1857                          (char**)&bufPtr, &outbuf );
1858
1859             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1860         }
1861         while ((cres == (size_t)-1) && (errno == E2BIG));
1862     }
1863
1864     if (ICONV_FAILED(cres, inbuf))
1865     {
1866         //VS: it is ok if iconv fails, hence trace only
1867         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1868         return wxCONV_FAILED;
1869     }
1870
1871     return res;
1872 }
1873
1874 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1875 {
1876 #if wxUSE_THREADS
1877     // NB: explained in MB2WC
1878     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1879 #endif
1880
1881     size_t inlen = wxWcslen(psz);
1882     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1883     size_t outbuf = n;
1884     size_t res, cres;
1885
1886     wchar_t *tmpbuf = 0;
1887
1888     if (ms_wcNeedsSwap)
1889     {
1890         // need to copy to temp buffer to switch endianness
1891         // (doing WC_BSWAP twice on the original buffer won't help, as it
1892         //  could be in read-only memory, or be accessed in some other thread)
1893         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1894         for ( size_t i = 0; i < inlen; i++ )
1895             tmpbuf[n] = WC_BSWAP(psz[i]);
1896
1897         tmpbuf[inlen] = L'\0';
1898         psz = tmpbuf;
1899     }
1900
1901     if (buf)
1902     {
1903         // have destination buffer, convert there
1904         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1905
1906         res = n - outbuf;
1907
1908         // NB: iconv was given only wcslen(psz) characters on input, and so
1909         //     it couldn't convert the trailing zero. Let's do it ourselves
1910         //     if there's some room left for it in the output buffer.
1911         if (res < n)
1912             buf[0] = 0;
1913     }
1914     else
1915     {
1916         // no destination buffer: convert using temp buffer
1917         // to calculate destination buffer requirement
1918         char tbuf[16];
1919         res = 0;
1920         do
1921         {
1922             buf = tbuf;
1923             outbuf = 16;
1924
1925             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1926
1927             res += 16 - outbuf;
1928         }
1929         while ((cres == (size_t)-1) && (errno == E2BIG));
1930     }
1931
1932     if (ms_wcNeedsSwap)
1933     {
1934         free(tmpbuf);
1935     }
1936
1937     if (ICONV_FAILED(cres, inbuf))
1938     {
1939         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1940         return wxCONV_FAILED;
1941     }
1942
1943     return res;
1944 }
1945
1946 size_t wxMBConv_iconv::GetMBNulLen() const
1947 {
1948     if ( m_minMBCharWidth == 0 )
1949     {
1950         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1951
1952 #if wxUSE_THREADS
1953         // NB: explained in MB2WC
1954         wxMutexLocker lock(self->m_iconvMutex);
1955 #endif
1956
1957         wchar_t *wnul = L"";
1958         char buf[8]; // should be enough for NUL in any encoding
1959         size_t inLen = sizeof(wchar_t),
1960                outLen = WXSIZEOF(buf);
1961         char *inBuff = (char *)wnul;
1962         char *outBuff = buf;
1963         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1964         {
1965             self->m_minMBCharWidth = (size_t)-1;
1966         }
1967         else // ok
1968         {
1969             self->m_minMBCharWidth = outBuff - buf;
1970         }
1971     }
1972
1973     return m_minMBCharWidth;
1974 }
1975
1976 #endif // HAVE_ICONV
1977
1978
1979 // ============================================================================
1980 // Win32 conversion classes
1981 // ============================================================================
1982
1983 #ifdef wxHAVE_WIN32_MB2WC
1984
1985 // from utils.cpp
1986 #if wxUSE_FONTMAP
1987 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1988 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1989 #endif
1990
1991 class wxMBConv_win32 : public wxMBConv
1992 {
1993 public:
1994     wxMBConv_win32()
1995     {
1996         m_CodePage = CP_ACP;
1997         m_minMBCharWidth = 0;
1998     }
1999
2000     wxMBConv_win32(const wxMBConv_win32& conv)
2001         : wxMBConv()
2002     {
2003         m_CodePage = conv.m_CodePage;
2004         m_minMBCharWidth = conv.m_minMBCharWidth;
2005     }
2006
2007 #if wxUSE_FONTMAP
2008     wxMBConv_win32(const wxChar* name)
2009     {
2010         m_CodePage = wxCharsetToCodepage(name);
2011         m_minMBCharWidth = 0;
2012     }
2013
2014     wxMBConv_win32(wxFontEncoding encoding)
2015     {
2016         m_CodePage = wxEncodingToCodepage(encoding);
2017         m_minMBCharWidth = 0;
2018     }
2019 #endif // wxUSE_FONTMAP
2020
2021     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2022     {
2023         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2024         // the behaviour is not compatible with the Unix version (using iconv)
2025         // and break the library itself, e.g. wxTextInputStream::NextChar()
2026         // wouldn't work if reading an incomplete MB char didn't result in an
2027         // error
2028         //
2029         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2030         // Win XP or newer and it is not supported for UTF-[78] so we always
2031         // use our own conversions in this case. See
2032         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2033         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2034         if ( m_CodePage == CP_UTF8 )
2035         {
2036             return wxConvUTF8.MB2WC(buf, psz, n);
2037         }
2038
2039         if ( m_CodePage == CP_UTF7 )
2040         {
2041             return wxConvUTF7.MB2WC(buf, psz, n);
2042         }
2043
2044         int flags = 0;
2045         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2046                 IsAtLeastWin2kSP4() )
2047         {
2048             flags = MB_ERR_INVALID_CHARS;
2049         }
2050
2051         const size_t len = ::MultiByteToWideChar
2052                              (
2053                                 m_CodePage,     // code page
2054                                 flags,          // flags: fall on error
2055                                 psz,            // input string
2056                                 -1,             // its length (NUL-terminated)
2057                                 buf,            // output string
2058                                 buf ? n : 0     // size of output buffer
2059                              );
2060         if ( !len )
2061         {
2062             // function totally failed
2063             return wxCONV_FAILED;
2064         }
2065
2066         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2067         // check if we succeeded, by doing a double trip:
2068         if ( !flags && buf )
2069         {
2070             const size_t mbLen = strlen(psz);
2071             wxCharBuffer mbBuf(mbLen);
2072             if ( ::WideCharToMultiByte
2073                    (
2074                       m_CodePage,
2075                       0,
2076                       buf,
2077                       -1,
2078                       mbBuf.data(),
2079                       mbLen + 1,        // size in bytes, not length
2080                       NULL,
2081                       NULL
2082                    ) == 0 ||
2083                   strcmp(mbBuf, psz) != 0 )
2084             {
2085                 // we didn't obtain the same thing we started from, hence
2086                 // the conversion was lossy and we consider that it failed
2087                 return wxCONV_FAILED;
2088             }
2089         }
2090
2091         // note that it returns count of written chars for buf != NULL and size
2092         // of the needed buffer for buf == NULL so in either case the length of
2093         // the string (which never includes the terminating NUL) is one less
2094         return len - 1;
2095     }
2096
2097     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2098     {
2099         /*
2100             we have a problem here: by default, WideCharToMultiByte() may
2101             replace characters unrepresentable in the target code page with bad
2102             quality approximations such as turning "1/2" symbol (U+00BD) into
2103             "1" for the code pages which don't have it and we, obviously, want
2104             to avoid this at any price
2105
2106             the trouble is that this function does it _silently_, i.e. it won't
2107             even tell us whether it did or not... Win98/2000 and higher provide
2108             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2109             we have to resort to a round trip, i.e. check that converting back
2110             results in the same string -- this is, of course, expensive but
2111             otherwise we simply can't be sure to not garble the data.
2112          */
2113
2114         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2115         // it doesn't work with CJK encodings (which we test for rather roughly
2116         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2117         // supporting it
2118         BOOL usedDef wxDUMMY_INITIALIZE(false);
2119         BOOL *pUsedDef;
2120         int flags;
2121         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2122         {
2123             // it's our lucky day
2124             flags = WC_NO_BEST_FIT_CHARS;
2125             pUsedDef = &usedDef;
2126         }
2127         else // old system or unsupported encoding
2128         {
2129             flags = 0;
2130             pUsedDef = NULL;
2131         }
2132
2133         const size_t len = ::WideCharToMultiByte
2134                              (
2135                                 m_CodePage,     // code page
2136                                 flags,          // either none or no best fit
2137                                 pwz,            // input string
2138                                 -1,             // it is (wide) NUL-terminated
2139                                 buf,            // output buffer
2140                                 buf ? n : 0,    // and its size
2141                                 NULL,           // default "replacement" char
2142                                 pUsedDef        // [out] was it used?
2143                              );
2144
2145         if ( !len )
2146         {
2147             // function totally failed
2148             return wxCONV_FAILED;
2149         }
2150
2151         // if we were really converting, check if we succeeded
2152         if ( buf )
2153         {
2154             if ( flags )
2155             {
2156                 // check if the conversion failed, i.e. if any replacements
2157                 // were done
2158                 if ( usedDef )
2159                     return wxCONV_FAILED;
2160             }
2161             else // we must resort to double tripping...
2162             {
2163                 wxWCharBuffer wcBuf(n);
2164                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2165                         wcscmp(wcBuf, pwz) != 0 )
2166                 {
2167                     // we didn't obtain the same thing we started from, hence
2168                     // the conversion was lossy and we consider that it failed
2169                     return wxCONV_FAILED;
2170                 }
2171             }
2172         }
2173
2174         // see the comment above for the reason of "len - 1"
2175         return len - 1;
2176     }
2177
2178     virtual size_t GetMBNulLen() const
2179     {
2180         if ( m_minMBCharWidth == 0 )
2181         {
2182             int len = ::WideCharToMultiByte
2183                         (
2184                             m_CodePage,     // code page
2185                             0,              // no flags
2186                             L"",            // input string
2187                             1,              // translate just the NUL
2188                             NULL,           // output buffer
2189                             0,              // and its size
2190                             NULL,           // no replacement char
2191                             NULL            // [out] don't care if it was used
2192                         );
2193
2194             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2195             switch ( len )
2196             {
2197                 default:
2198                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2199                     self->m_minMBCharWidth = (size_t)-1;
2200                     break;
2201
2202                 case 0:
2203                     self->m_minMBCharWidth = (size_t)-1;
2204                     break;
2205
2206                 case 1:
2207                 case 2:
2208                 case 4:
2209                     self->m_minMBCharWidth = len;
2210                     break;
2211             }
2212         }
2213
2214         return m_minMBCharWidth;
2215     }
2216
2217     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2218
2219     bool IsOk() const { return m_CodePage != -1; }
2220
2221 private:
2222     static bool CanUseNoBestFit()
2223     {
2224         static int s_isWin98Or2k = -1;
2225
2226         if ( s_isWin98Or2k == -1 )
2227         {
2228             int verMaj, verMin;
2229             switch ( wxGetOsVersion(&verMaj, &verMin) )
2230             {
2231                 case wxWIN95:
2232                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2233                     break;
2234
2235                 case wxWINDOWS_NT:
2236                     s_isWin98Or2k = verMaj >= 5;
2237                     break;
2238
2239                 default:
2240                     // unknown: be conservative by default
2241                     s_isWin98Or2k = 0;
2242                     break;
2243             }
2244
2245             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2246         }
2247
2248         return s_isWin98Or2k == 1;
2249     }
2250
2251     static bool IsAtLeastWin2kSP4()
2252     {
2253 #ifdef __WXWINCE__
2254         return false;
2255 #else
2256         static int s_isAtLeastWin2kSP4 = -1;
2257
2258         if ( s_isAtLeastWin2kSP4 == -1 )
2259         {
2260             OSVERSIONINFOEX ver;
2261
2262             memset(&ver, 0, sizeof(ver));
2263             ver.dwOSVersionInfoSize = sizeof(ver);
2264             GetVersionEx((OSVERSIONINFO*)&ver);
2265
2266             s_isAtLeastWin2kSP4 =
2267               ((ver.dwMajorVersion > 5) || // Vista+
2268                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2269                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2270                ver.wServicePackMajor >= 4)) // 2000 SP4+
2271               ? 1 : 0;
2272         }
2273
2274         return s_isAtLeastWin2kSP4 == 1;
2275 #endif
2276     }
2277
2278
2279     // the code page we're working with
2280     long m_CodePage;
2281
2282     // cached result of GetMBNulLen(), set to 0 initially meaning
2283     // "unknown"
2284     size_t m_minMBCharWidth;
2285 };
2286
2287 #endif // wxHAVE_WIN32_MB2WC
2288
2289 // ============================================================================
2290 // Cocoa conversion classes
2291 // ============================================================================
2292
2293 #if defined(__WXCOCOA__)
2294
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2298
2299 #include <CoreFoundation/CFString.h>
2300 #include <CoreFoundation/CFStringEncodingExt.h>
2301
2302 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2303 {
2304     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2305
2306     switch (encoding)
2307     {
2308         case wxFONTENCODING_DEFAULT :
2309             enc = CFStringGetSystemEncoding();
2310             break ;
2311
2312         case wxFONTENCODING_ISO8859_1 :
2313             enc = kCFStringEncodingISOLatin1 ;
2314             break ;
2315         case wxFONTENCODING_ISO8859_2 :
2316             enc = kCFStringEncodingISOLatin2;
2317             break ;
2318         case wxFONTENCODING_ISO8859_3 :
2319             enc = kCFStringEncodingISOLatin3 ;
2320             break ;
2321         case wxFONTENCODING_ISO8859_4 :
2322             enc = kCFStringEncodingISOLatin4;
2323             break ;
2324         case wxFONTENCODING_ISO8859_5 :
2325             enc = kCFStringEncodingISOLatinCyrillic;
2326             break ;
2327         case wxFONTENCODING_ISO8859_6 :
2328             enc = kCFStringEncodingISOLatinArabic;
2329             break ;
2330         case wxFONTENCODING_ISO8859_7 :
2331             enc = kCFStringEncodingISOLatinGreek;
2332             break ;
2333         case wxFONTENCODING_ISO8859_8 :
2334             enc = kCFStringEncodingISOLatinHebrew;
2335             break ;
2336         case wxFONTENCODING_ISO8859_9 :
2337             enc = kCFStringEncodingISOLatin5;
2338             break ;
2339         case wxFONTENCODING_ISO8859_10 :
2340             enc = kCFStringEncodingISOLatin6;
2341             break ;
2342         case wxFONTENCODING_ISO8859_11 :
2343             enc = kCFStringEncodingISOLatinThai;
2344             break ;
2345         case wxFONTENCODING_ISO8859_13 :
2346             enc = kCFStringEncodingISOLatin7;
2347             break ;
2348         case wxFONTENCODING_ISO8859_14 :
2349             enc = kCFStringEncodingISOLatin8;
2350             break ;
2351         case wxFONTENCODING_ISO8859_15 :
2352             enc = kCFStringEncodingISOLatin9;
2353             break ;
2354
2355         case wxFONTENCODING_KOI8 :
2356             enc = kCFStringEncodingKOI8_R;
2357             break ;
2358         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2359             enc = kCFStringEncodingDOSRussian;
2360             break ;
2361
2362 //      case wxFONTENCODING_BULGARIAN :
2363 //          enc = ;
2364 //          break ;
2365
2366         case wxFONTENCODING_CP437 :
2367             enc = kCFStringEncodingDOSLatinUS ;
2368             break ;
2369         case wxFONTENCODING_CP850 :
2370             enc = kCFStringEncodingDOSLatin1;
2371             break ;
2372         case wxFONTENCODING_CP852 :
2373             enc = kCFStringEncodingDOSLatin2;
2374             break ;
2375         case wxFONTENCODING_CP855 :
2376             enc = kCFStringEncodingDOSCyrillic;
2377             break ;
2378         case wxFONTENCODING_CP866 :
2379             enc = kCFStringEncodingDOSRussian ;
2380             break ;
2381         case wxFONTENCODING_CP874 :
2382             enc = kCFStringEncodingDOSThai;
2383             break ;
2384         case wxFONTENCODING_CP932 :
2385             enc = kCFStringEncodingDOSJapanese;
2386             break ;
2387         case wxFONTENCODING_CP936 :
2388             enc = kCFStringEncodingDOSChineseSimplif ;
2389             break ;
2390         case wxFONTENCODING_CP949 :
2391             enc = kCFStringEncodingDOSKorean;
2392             break ;
2393         case wxFONTENCODING_CP950 :
2394             enc = kCFStringEncodingDOSChineseTrad;
2395             break ;
2396         case wxFONTENCODING_CP1250 :
2397             enc = kCFStringEncodingWindowsLatin2;
2398             break ;
2399         case wxFONTENCODING_CP1251 :
2400             enc = kCFStringEncodingWindowsCyrillic ;
2401             break ;
2402         case wxFONTENCODING_CP1252 :
2403             enc = kCFStringEncodingWindowsLatin1 ;
2404             break ;
2405         case wxFONTENCODING_CP1253 :
2406             enc = kCFStringEncodingWindowsGreek;
2407             break ;
2408         case wxFONTENCODING_CP1254 :
2409             enc = kCFStringEncodingWindowsLatin5;
2410             break ;
2411         case wxFONTENCODING_CP1255 :
2412             enc = kCFStringEncodingWindowsHebrew ;
2413             break ;
2414         case wxFONTENCODING_CP1256 :
2415             enc = kCFStringEncodingWindowsArabic ;
2416             break ;
2417         case wxFONTENCODING_CP1257 :
2418             enc = kCFStringEncodingWindowsBalticRim;
2419             break ;
2420 //   This only really encodes to UTF7 (if that) evidently
2421 //        case wxFONTENCODING_UTF7 :
2422 //            enc = kCFStringEncodingNonLossyASCII ;
2423 //            break ;
2424         case wxFONTENCODING_UTF8 :
2425             enc = kCFStringEncodingUTF8 ;
2426             break ;
2427         case wxFONTENCODING_EUC_JP :
2428             enc = kCFStringEncodingEUC_JP;
2429             break ;
2430         case wxFONTENCODING_UTF16 :
2431             enc = kCFStringEncodingUnicode ;
2432             break ;
2433         case wxFONTENCODING_MACROMAN :
2434             enc = kCFStringEncodingMacRoman ;
2435             break ;
2436         case wxFONTENCODING_MACJAPANESE :
2437             enc = kCFStringEncodingMacJapanese ;
2438             break ;
2439         case wxFONTENCODING_MACCHINESETRAD :
2440             enc = kCFStringEncodingMacChineseTrad ;
2441             break ;
2442         case wxFONTENCODING_MACKOREAN :
2443             enc = kCFStringEncodingMacKorean ;
2444             break ;
2445         case wxFONTENCODING_MACARABIC :
2446             enc = kCFStringEncodingMacArabic ;
2447             break ;
2448         case wxFONTENCODING_MACHEBREW :
2449             enc = kCFStringEncodingMacHebrew ;
2450             break ;
2451         case wxFONTENCODING_MACGREEK :
2452             enc = kCFStringEncodingMacGreek ;
2453             break ;
2454         case wxFONTENCODING_MACCYRILLIC :
2455             enc = kCFStringEncodingMacCyrillic ;
2456             break ;
2457         case wxFONTENCODING_MACDEVANAGARI :
2458             enc = kCFStringEncodingMacDevanagari ;
2459             break ;
2460         case wxFONTENCODING_MACGURMUKHI :
2461             enc = kCFStringEncodingMacGurmukhi ;
2462             break ;
2463         case wxFONTENCODING_MACGUJARATI :
2464             enc = kCFStringEncodingMacGujarati ;
2465             break ;
2466         case wxFONTENCODING_MACORIYA :
2467             enc = kCFStringEncodingMacOriya ;
2468             break ;
2469         case wxFONTENCODING_MACBENGALI :
2470             enc = kCFStringEncodingMacBengali ;
2471             break ;
2472         case wxFONTENCODING_MACTAMIL :
2473             enc = kCFStringEncodingMacTamil ;
2474             break ;
2475         case wxFONTENCODING_MACTELUGU :
2476             enc = kCFStringEncodingMacTelugu ;
2477             break ;
2478         case wxFONTENCODING_MACKANNADA :
2479             enc = kCFStringEncodingMacKannada ;
2480             break ;
2481         case wxFONTENCODING_MACMALAJALAM :
2482             enc = kCFStringEncodingMacMalayalam ;
2483             break ;
2484         case wxFONTENCODING_MACSINHALESE :
2485             enc = kCFStringEncodingMacSinhalese ;
2486             break ;
2487         case wxFONTENCODING_MACBURMESE :
2488             enc = kCFStringEncodingMacBurmese ;
2489             break ;
2490         case wxFONTENCODING_MACKHMER :
2491             enc = kCFStringEncodingMacKhmer ;
2492             break ;
2493         case wxFONTENCODING_MACTHAI :
2494             enc = kCFStringEncodingMacThai ;
2495             break ;
2496         case wxFONTENCODING_MACLAOTIAN :
2497             enc = kCFStringEncodingMacLaotian ;
2498             break ;
2499         case wxFONTENCODING_MACGEORGIAN :
2500             enc = kCFStringEncodingMacGeorgian ;
2501             break ;
2502         case wxFONTENCODING_MACARMENIAN :
2503             enc = kCFStringEncodingMacArmenian ;
2504             break ;
2505         case wxFONTENCODING_MACCHINESESIMP :
2506             enc = kCFStringEncodingMacChineseSimp ;
2507             break ;
2508         case wxFONTENCODING_MACTIBETAN :
2509             enc = kCFStringEncodingMacTibetan ;
2510             break ;
2511         case wxFONTENCODING_MACMONGOLIAN :
2512             enc = kCFStringEncodingMacMongolian ;
2513             break ;
2514         case wxFONTENCODING_MACETHIOPIC :
2515             enc = kCFStringEncodingMacEthiopic ;
2516             break ;
2517         case wxFONTENCODING_MACCENTRALEUR :
2518             enc = kCFStringEncodingMacCentralEurRoman ;
2519             break ;
2520         case wxFONTENCODING_MACVIATNAMESE :
2521             enc = kCFStringEncodingMacVietnamese ;
2522             break ;
2523         case wxFONTENCODING_MACARABICEXT :
2524             enc = kCFStringEncodingMacExtArabic ;
2525             break ;
2526         case wxFONTENCODING_MACSYMBOL :
2527             enc = kCFStringEncodingMacSymbol ;
2528             break ;
2529         case wxFONTENCODING_MACDINGBATS :
2530             enc = kCFStringEncodingMacDingbats ;
2531             break ;
2532         case wxFONTENCODING_MACTURKISH :
2533             enc = kCFStringEncodingMacTurkish ;
2534             break ;
2535         case wxFONTENCODING_MACCROATIAN :
2536             enc = kCFStringEncodingMacCroatian ;
2537             break ;
2538         case wxFONTENCODING_MACICELANDIC :
2539             enc = kCFStringEncodingMacIcelandic ;
2540             break ;
2541         case wxFONTENCODING_MACROMANIAN :
2542             enc = kCFStringEncodingMacRomanian ;
2543             break ;
2544         case wxFONTENCODING_MACCELTIC :
2545             enc = kCFStringEncodingMacCeltic ;
2546             break ;
2547         case wxFONTENCODING_MACGAELIC :
2548             enc = kCFStringEncodingMacGaelic ;
2549             break ;
2550 //      case wxFONTENCODING_MACKEYBOARD :
2551 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2552 //          break ;
2553
2554         default :
2555             // because gcc is picky
2556             break ;
2557     }
2558
2559     return enc ;
2560 }
2561
2562 class wxMBConv_cocoa : public wxMBConv
2563 {
2564 public:
2565     wxMBConv_cocoa()
2566     {
2567         Init(CFStringGetSystemEncoding()) ;
2568     }
2569
2570     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2571     {
2572         m_encoding = conv.m_encoding;
2573     }
2574
2575 #if wxUSE_FONTMAP
2576     wxMBConv_cocoa(const wxChar* name)
2577     {
2578         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2579     }
2580 #endif
2581
2582     wxMBConv_cocoa(wxFontEncoding encoding)
2583     {
2584         Init( wxCFStringEncFromFontEnc(encoding) );
2585     }
2586
2587     ~wxMBConv_cocoa()
2588     {
2589     }
2590
2591     void Init( CFStringEncoding encoding)
2592     {
2593         m_encoding = encoding ;
2594     }
2595
2596     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2597     {
2598         wxASSERT(szUnConv);
2599
2600         CFStringRef theString = CFStringCreateWithBytes (
2601                                                 NULL, //the allocator
2602                                                 (const UInt8*)szUnConv,
2603                                                 strlen(szUnConv),
2604                                                 m_encoding,
2605                                                 false //no BOM/external representation
2606                                                 );
2607
2608         wxASSERT(theString);
2609
2610         size_t nOutLength = CFStringGetLength(theString);
2611
2612         if (szOut == NULL)
2613         {
2614             CFRelease(theString);
2615             return nOutLength;
2616         }
2617
2618         CFRange theRange = { 0, nOutSize };
2619
2620 #if SIZEOF_WCHAR_T == 4
2621         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2622 #endif
2623
2624         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2625
2626         CFRelease(theString);
2627
2628         szUniCharBuffer[nOutLength] = '\0';
2629
2630 #if SIZEOF_WCHAR_T == 4
2631         wxMBConvUTF16 converter;
2632         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2633         delete [] szUniCharBuffer;
2634 #endif
2635
2636         return nOutLength;
2637     }
2638
2639     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2640     {
2641         wxASSERT(szUnConv);
2642
2643         size_t nRealOutSize;
2644         size_t nBufSize = wxWcslen(szUnConv);
2645         UniChar* szUniBuffer = (UniChar*) szUnConv;
2646
2647 #if SIZEOF_WCHAR_T == 4
2648         wxMBConvUTF16 converter ;
2649         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2650         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2651         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2652         nBufSize /= sizeof(UniChar);
2653 #endif
2654
2655         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2656                                 NULL, //allocator
2657                                 szUniBuffer,
2658                                 nBufSize,
2659                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2660                             );
2661
2662         wxASSERT(theString);
2663
2664         //Note that CER puts a BOM when converting to unicode
2665         //so we  check and use getchars instead in that case
2666         if (m_encoding == kCFStringEncodingUnicode)
2667         {
2668             if (szOut != NULL)
2669                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2670
2671             nRealOutSize = CFStringGetLength(theString) + 1;
2672         }
2673         else
2674         {
2675             CFStringGetBytes(
2676                 theString,
2677                 CFRangeMake(0, CFStringGetLength(theString)),
2678                 m_encoding,
2679                 0, //what to put in characters that can't be converted -
2680                     //0 tells CFString to return NULL if it meets such a character
2681                 false, //not an external representation
2682                 (UInt8*) szOut,
2683                 nOutSize,
2684                 (CFIndex*) &nRealOutSize
2685                         );
2686         }
2687
2688         CFRelease(theString);
2689
2690 #if SIZEOF_WCHAR_T == 4
2691         delete[] szUniBuffer;
2692 #endif
2693
2694         return  nRealOutSize - 1;
2695     }
2696
2697     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2698
2699     bool IsOk() const
2700     {
2701         return m_encoding != kCFStringEncodingInvalidId &&
2702               CFStringIsEncodingAvailable(m_encoding);
2703     }
2704
2705 private:
2706     CFStringEncoding m_encoding ;
2707 };
2708
2709 #endif // defined(__WXCOCOA__)
2710
2711 // ============================================================================
2712 // Mac conversion classes
2713 // ============================================================================
2714
2715 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2716
2717 class wxMBConv_mac : public wxMBConv
2718 {
2719 public:
2720     wxMBConv_mac()
2721     {
2722         Init(CFStringGetSystemEncoding()) ;
2723     }
2724
2725     wxMBConv_mac(const wxMBConv_mac& conv)
2726     {
2727         Init(conv.m_char_encoding);
2728     }
2729
2730 #if wxUSE_FONTMAP
2731     wxMBConv_mac(const wxChar* name)
2732     {
2733         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2734     }
2735 #endif
2736
2737     wxMBConv_mac(wxFontEncoding encoding)
2738     {
2739         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2740     }
2741
2742     ~wxMBConv_mac()
2743     {
2744         OSStatus status = noErr ;
2745         status = TECDisposeConverter(m_MB2WC_converter);
2746         status = TECDisposeConverter(m_WC2MB_converter);
2747     }
2748
2749
2750     void Init( TextEncodingBase encoding)
2751     {
2752         OSStatus status = noErr ;
2753         m_char_encoding = encoding ;
2754         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2755
2756         status = TECCreateConverter(&m_MB2WC_converter,
2757                                     m_char_encoding,
2758                                     m_unicode_encoding);
2759         status = TECCreateConverter(&m_WC2MB_converter,
2760                                     m_unicode_encoding,
2761                                     m_char_encoding);
2762     }
2763
2764     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2765     {
2766         OSStatus status = noErr ;
2767         ByteCount byteOutLen ;
2768         ByteCount byteInLen = strlen(psz) + 1;
2769         wchar_t *tbuf = NULL ;
2770         UniChar* ubuf = NULL ;
2771         size_t res = 0 ;
2772
2773         if (buf == NULL)
2774         {
2775             // Apple specs say at least 32
2776             n = wxMax( 32, byteInLen ) ;
2777             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2778         }
2779
2780         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2781
2782 #if SIZEOF_WCHAR_T == 4
2783         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2784 #else
2785         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2786 #endif
2787
2788         status = TECConvertText(
2789             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2790             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2791
2792 #if SIZEOF_WCHAR_T == 4
2793         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2794         // is not properly terminated we get random characters at the end
2795         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2796         wxMBConvUTF16 converter ;
2797         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2798         free( ubuf ) ;
2799 #else
2800         res = byteOutLen / sizeof( UniChar ) ;
2801 #endif
2802
2803         if ( buf == NULL )
2804              free(tbuf) ;
2805
2806         if ( buf  && res < n)
2807             buf[res] = 0;
2808
2809         return res ;
2810     }
2811
2812     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2813     {
2814         OSStatus status = noErr ;
2815         ByteCount byteOutLen ;
2816         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2817
2818         char *tbuf = NULL ;
2819
2820         if (buf == NULL)
2821         {
2822             // Apple specs say at least 32
2823             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2824             tbuf = (char*) malloc( n ) ;
2825         }
2826
2827         ByteCount byteBufferLen = n ;
2828         UniChar* ubuf = NULL ;
2829
2830 #if SIZEOF_WCHAR_T == 4
2831         wxMBConvUTF16 converter ;
2832         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2833         byteInLen = unicharlen ;
2834         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2835         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2836 #else
2837         ubuf = (UniChar*) psz ;
2838 #endif
2839
2840         status = TECConvertText(
2841             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2842             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2843
2844 #if SIZEOF_WCHAR_T == 4
2845         free( ubuf ) ;
2846 #endif
2847
2848         if ( buf == NULL )
2849             free(tbuf) ;
2850
2851         size_t res = byteOutLen ;
2852         if ( buf  && res < n)
2853         {
2854             buf[res] = 0;
2855
2856             //we need to double-trip to verify it didn't insert any ? in place
2857             //of bogus characters
2858             wxWCharBuffer wcBuf(n);
2859             size_t pszlen = wxWcslen(psz);
2860             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2861                         wxWcslen(wcBuf) != pszlen ||
2862                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2863             {
2864                 // we didn't obtain the same thing we started from, hence
2865                 // the conversion was lossy and we consider that it failed
2866                 return wxCONV_FAILED;
2867             }
2868         }
2869
2870         return res ;
2871     }
2872
2873     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2874
2875     bool IsOk() const
2876         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2877
2878 private:
2879     TECObjectRef m_MB2WC_converter;
2880     TECObjectRef m_WC2MB_converter;
2881
2882     TextEncodingBase m_char_encoding;
2883     TextEncodingBase m_unicode_encoding;
2884 };
2885
2886 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2887
2888 // ============================================================================
2889 // wxEncodingConverter based conversion classes
2890 // ============================================================================
2891
2892 #if wxUSE_FONTMAP
2893
2894 class wxMBConv_wxwin : public wxMBConv
2895 {
2896 private:
2897     void Init()
2898     {
2899         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2900                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2901     }
2902
2903 public:
2904     // temporarily just use wxEncodingConverter stuff,
2905     // so that it works while a better implementation is built
2906     wxMBConv_wxwin(const wxChar* name)
2907     {
2908         if (name)
2909             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2910         else
2911             m_enc = wxFONTENCODING_SYSTEM;
2912
2913         Init();
2914     }
2915
2916     wxMBConv_wxwin(wxFontEncoding enc)
2917     {
2918         m_enc = enc;
2919
2920         Init();
2921     }
2922
2923     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2924     {
2925         size_t inbuf = strlen(psz);
2926         if (buf)
2927         {
2928             if (!m2w.Convert(psz, buf))
2929                 return wxCONV_FAILED;
2930         }
2931         return inbuf;
2932     }
2933
2934     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2935     {
2936         const size_t inbuf = wxWcslen(psz);
2937         if (buf)
2938         {
2939             if (!w2m.Convert(psz, buf))
2940                 return wxCONV_FAILED;
2941         }
2942
2943         return inbuf;
2944     }
2945
2946     virtual size_t GetMBNulLen() const
2947     {
2948         switch ( m_enc )
2949         {
2950             case wxFONTENCODING_UTF16BE:
2951             case wxFONTENCODING_UTF16LE:
2952                 return 2;
2953
2954             case wxFONTENCODING_UTF32BE:
2955             case wxFONTENCODING_UTF32LE:
2956                 return 4;
2957
2958             default:
2959                 return 1;
2960         }
2961     }
2962
2963     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2964
2965     bool IsOk() const { return m_ok; }
2966
2967 public:
2968     wxFontEncoding m_enc;
2969     wxEncodingConverter m2w, w2m;
2970
2971 private:
2972     // were we initialized successfully?
2973     bool m_ok;
2974
2975     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2976 };
2977
2978 // make the constructors available for unit testing
2979 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2980 {
2981     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2982     if ( !result->IsOk() )
2983     {
2984         delete result;
2985         return 0;
2986     }
2987
2988     return result;
2989 }
2990
2991 #endif // wxUSE_FONTMAP
2992
2993 // ============================================================================
2994 // wxCSConv implementation
2995 // ============================================================================
2996
2997 void wxCSConv::Init()
2998 {
2999     m_name = NULL;
3000     m_convReal =  NULL;
3001     m_deferred = true;
3002 }
3003
3004 wxCSConv::wxCSConv(const wxChar *charset)
3005 {
3006     Init();
3007
3008     if ( charset )
3009     {
3010         SetName(charset);
3011     }
3012
3013 #if wxUSE_FONTMAP
3014     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3015 #else
3016     m_encoding = wxFONTENCODING_SYSTEM;
3017 #endif
3018 }
3019
3020 wxCSConv::wxCSConv(wxFontEncoding encoding)
3021 {
3022     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3023     {
3024         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3025
3026         encoding = wxFONTENCODING_SYSTEM;
3027     }
3028
3029     Init();
3030
3031     m_encoding = encoding;
3032 }
3033
3034 wxCSConv::~wxCSConv()
3035 {
3036     Clear();
3037 }
3038
3039 wxCSConv::wxCSConv(const wxCSConv& conv)
3040         : wxMBConv()
3041 {
3042     Init();
3043
3044     SetName(conv.m_name);
3045     m_encoding = conv.m_encoding;
3046 }
3047
3048 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3049 {
3050     Clear();
3051
3052     SetName(conv.m_name);
3053     m_encoding = conv.m_encoding;
3054
3055     return *this;
3056 }
3057
3058 void wxCSConv::Clear()
3059 {
3060     free(m_name);
3061     delete m_convReal;
3062
3063     m_name = NULL;
3064     m_convReal = NULL;
3065 }
3066
3067 void wxCSConv::SetName(const wxChar *charset)
3068 {
3069     if (charset)
3070     {
3071         m_name = wxStrdup(charset);
3072         m_deferred = true;
3073     }
3074 }
3075
3076 #if wxUSE_FONTMAP
3077
// Hash map from a font encoding to a charset name (wxString) for it.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Used by wxCSConv::DoCreate() to remember, for each encoding, the name
// under which wxMBConv_iconv could be created; an empty string records that
// none of the names of this encoding worked.
static wxEncodingNameCache gs_nameCache;
3082 #endif
3083
3084 wxMBConv *wxCSConv::DoCreate() const
3085 {
3086 #if wxUSE_FONTMAP
3087     wxLogTrace(TRACE_STRCONV,
3088                wxT("creating conversion for %s"),
3089                (m_name ? m_name
3090                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3091 #endif // wxUSE_FONTMAP
3092
3093     // check for the special case of ASCII or ISO8859-1 charset: as we have
3094     // special knowledge of it anyhow, we don't need to create a special
3095     // conversion object
3096     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3097             m_encoding == wxFONTENCODING_DEFAULT )
3098     {
3099         // don't convert at all
3100         return NULL;
3101     }
3102
3103     // we trust OS to do conversion better than we can so try external
3104     // conversion methods first
3105     //
3106     // the full order is:
3107     //      1. OS conversion (iconv() under Unix or Win32 API)
3108     //      2. hard coded conversions for UTF
3109     //      3. wxEncodingConverter as fall back
3110
3111     // step (1)
3112 #ifdef HAVE_ICONV
3113 #if !wxUSE_FONTMAP
3114     if ( m_name )
3115 #endif // !wxUSE_FONTMAP
3116     {
3117         wxString name(m_name);
3118         wxFontEncoding encoding(m_encoding);
3119
3120         if ( !name.empty() )
3121         {
3122             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3123             if ( conv->IsOk() )
3124                 return conv;
3125
3126             delete conv;
3127
3128 #if wxUSE_FONTMAP
3129             encoding =
3130                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3131 #endif // wxUSE_FONTMAP
3132         }
3133 #if wxUSE_FONTMAP
3134         {
3135             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3136             if ( it != gs_nameCache.end() )
3137             {
3138                 if ( it->second.empty() )
3139                     return NULL;
3140
3141                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3142                 if ( conv->IsOk() )
3143                     return conv;
3144
3145                 delete conv;
3146             }
3147
3148             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3149
3150             for ( ; *names; ++names )
3151             {
3152                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3153                 if ( conv->IsOk() )
3154                 {
3155                     gs_nameCache[encoding] = *names;
3156                     return conv;
3157                 }
3158
3159                 delete conv;
3160             }
3161
3162             gs_nameCache[encoding] = _T(""); // cache the failure
3163         }
3164 #endif // wxUSE_FONTMAP
3165     }
3166 #endif // HAVE_ICONV
3167
3168 #ifdef wxHAVE_WIN32_MB2WC
3169     {
3170 #if wxUSE_FONTMAP
3171         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3172                                       : new wxMBConv_win32(m_encoding);
3173         if ( conv->IsOk() )
3174             return conv;
3175
3176         delete conv;
3177 #else
3178         return NULL;
3179 #endif
3180     }
3181 #endif // wxHAVE_WIN32_MB2WC
3182
3183 #if defined(__WXMAC__)
3184     {
3185         // leave UTF16 and UTF32 to the built-ins of wx
3186         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3187             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3188         {
3189 #if wxUSE_FONTMAP
3190             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3191                                         : new wxMBConv_mac(m_encoding);
3192 #else
3193             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3194 #endif
3195             if ( conv->IsOk() )
3196                  return conv;
3197
3198             delete conv;
3199         }
3200     }
3201 #endif
3202
3203 #if defined(__WXCOCOA__)
3204     {
3205         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3206         {
3207 #if wxUSE_FONTMAP
3208             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3209                                           : new wxMBConv_cocoa(m_encoding);
3210 #else
3211             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3212 #endif
3213
3214             if ( conv->IsOk() )
3215                  return conv;
3216
3217             delete conv;
3218         }
3219     }
3220 #endif
3221     // step (2)
3222     wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225     {
3226         // use "false" to suppress interactive dialogs -- we can be called from
3227         // anywhere and popping up a dialog from here is the last thing we want to
3228         // do
3229         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230     }
3231 #endif // wxUSE_FONTMAP
3232
3233     switch ( enc )
3234     {
3235         case wxFONTENCODING_UTF7:
3236              return new wxMBConvUTF7;
3237
3238         case wxFONTENCODING_UTF8:
3239              return new wxMBConvUTF8;
3240
3241         case wxFONTENCODING_UTF16BE:
3242              return new wxMBConvUTF16BE;
3243
3244         case wxFONTENCODING_UTF16LE:
3245              return new wxMBConvUTF16LE;
3246
3247         case wxFONTENCODING_UTF32BE:
3248              return new wxMBConvUTF32BE;
3249
3250         case wxFONTENCODING_UTF32LE:
3251              return new wxMBConvUTF32LE;
3252
3253         default:
3254              // nothing to do but put here to suppress gcc warnings
3255              break;
3256     }
3257
3258     // step (3)
3259 #if wxUSE_FONTMAP
3260     {
3261         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262                                       : new wxMBConv_wxwin(m_encoding);
3263         if ( conv->IsOk() )
3264             return conv;
3265
3266         delete conv;
3267     }
3268 #endif // wxUSE_FONTMAP
3269
3270     // NB: This is a hack to prevent deadlock. What could otherwise happen
3271     //     in Unicode build: wxConvLocal creation ends up being here
3272     //     because of some failure and logs the error. But wxLog will try to
3273     //     attach timestamp, for which it will need wxConvLocal (to convert
3274     //     time to char* and then wchar_t*), but that fails, tries to log
3275     //     error, but wxLog has a (already locked) critical section that
3276     //     guards static buffer.
3277     static bool alreadyLoggingError = false;
3278     if (!alreadyLoggingError)
3279     {
3280         alreadyLoggingError = true;
3281         wxLogError(_("Cannot convert from the charset '%s'!"),
3282                    m_name ? m_name
3283                       :
3284 #if wxUSE_FONTMAP
3285                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3286 #else // !wxUSE_FONTMAP
3287                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3289               );
3290
3291         alreadyLoggingError = false;
3292     }
3293
3294     return NULL;
3295 }
3296
3297 void wxCSConv::CreateConvIfNeeded() const
3298 {
3299     if ( m_deferred )
3300     {
3301         wxCSConv *self = (wxCSConv *)this; // const_cast
3302
3303 #if wxUSE_INTL
3304         // if we don't have neither the name nor the encoding, use the default
3305         // encoding for this system
3306         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3307         {
3308             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3309         }
3310 #endif // wxUSE_INTL
3311
3312         self->m_convReal = DoCreate();
3313         self->m_deferred = false;
3314     }
3315 }
3316
3317 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3318 {
3319     CreateConvIfNeeded();
3320
3321     if (m_convReal)
3322         return m_convReal->MB2WC(buf, psz, n);
3323
3324     // latin-1 (direct)
3325     size_t len = strlen(psz);
3326
3327     if (buf)
3328     {
3329         for (size_t c = 0; c <= len; c++)
3330             buf[c] = (unsigned char)(psz[c]);
3331     }
3332
3333     return len;
3334 }
3335
3336 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3337 {
3338     CreateConvIfNeeded();
3339
3340     if (m_convReal)
3341         return m_convReal->WC2MB(buf, psz, n);
3342
3343     // latin-1 (direct)
3344     const size_t len = wxWcslen(psz);
3345     if (buf)
3346     {
3347         for (size_t c = 0; c <= len; c++)
3348         {
3349             if (psz[c] > 0xFF)
3350                 return wxCONV_FAILED;
3351
3352             buf[c] = (char)psz[c];
3353         }
3354     }
3355     else
3356     {
3357         for (size_t c = 0; c <= len; c++)
3358         {
3359             if (psz[c] > 0xFF)
3360                 return wxCONV_FAILED;
3361         }
3362     }
3363
3364     return len;
3365 }
3366
3367 size_t wxCSConv::GetMBNulLen() const
3368 {
3369     CreateConvIfNeeded();
3370
3371     if ( m_convReal )
3372     {
3373         return m_convReal->GetMBNulLen();
3374     }
3375
3376     return 1;
3377 }
3378
3379 // ----------------------------------------------------------------------------
3380 // globals
3381 // ----------------------------------------------------------------------------
3382
// The class of the object behind wxConvLibc depends on the platform:
// wxMBConv_win32 under Windows, wxMBConv_mac for the Mac builds without
// __MACH__ and wxMBConvLibc everywhere else.
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// The other predefined converters; all the objects are local to this file
// and only accessible via the exported references and pointers below.
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// exported references to the objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// exported pointers: wxConvCurrent initially points to the wxConvLibc object
// and wxConvUI to wxConvLocal
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// wxConvFileName initially points to the UTF-8 converter if __WXOSX__ is
// defined and to the wxConvLibc object otherwise
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3409
3410 #else // !wxUSE_WCHAR_T
3411
// stand-ins in absence of wchar_t: when wxUSE_WCHAR_T is 0 these names are
// defined as plain wxMBConv objects and not as references to the specialized
// converters
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3417
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T