src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  82 // be 4 bytes
  83 #if SIZEOF_WCHAR_T == 2
  84     #define WC_UTF16
  85 #endif
  86
  87 // ============================================================================
  88 // implementation
  89 // ============================================================================
  90
  91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  92 static bool NotAllNULs(const char *p, size_t n)
  93 {
  94     while ( n && *p++ == '\0' )
  95         n--;
  96
  97     return n != 0;
  98 }
  99
 100 // ----------------------------------------------------------------------------
 101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
 102 // ----------------------------------------------------------------------------
 103
 104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 105 {
 106     if (input<=0xffff)
 107     {
 108         if (output)
 109             *output = (wxUint16) input;
 110         return 1;
 111     }
 112     else if (input>=0x110000)
 113     {
 114         return wxCONV_FAILED;
 115     }
 116     else
 117     {
 118         if (output)
 119         {
 120             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 121             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 122         }
 123         return 2;
 124     }
 125 }
 126
 127 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 128 {
 129     if ((*input<0xd800) || (*input>0xdfff))
 130     {
 131         output = *input;
 132         return 1;
 133     }
 134     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 135     {
 136         output = *input;
 137         return wxCONV_FAILED;
 138     }
 139     else
 140     {
 141         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 142         return 2;
 143     }
 144 }
 145
 146 #ifdef WC_UTF16
 147
 148 // returns the next UTF-32 character from the wchar_t buffer and advances the
 149 // pointer to the character after this one
 150 //
 151 // if an invalid character is found, *pSrc is set to NULL, the caller must
 152 // check for this
 153 static wxUint32 wxDecodeSurrogate(const wchar_t **pSrc)
 154 {
 155     wxUint32 out;
 156     const size_t n = decode_utf16(*pSrc, out);
 157     if ( n == wxCONV_FAILED )
 158         *pSrc = NULL;
 159     else
 160         *pSrc += n;
 161
 162     return out;
 163 }
 164
 165 #endif // WC_UTF16
 166
 167 // ----------------------------------------------------------------------------
 168 // wxMBConv
 169 // ----------------------------------------------------------------------------
 170
 171 size_t
 172 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 173                   const char *src, size_t srcLen) const
 174 {
 175     // although new conversion classes are supposed to implement this function
 176     // directly, the existins ones only implement the old MB2WC() and so, to
 177     // avoid to have to rewrite all conversion classes at once, we provide a
 178     // default (but not efficient) implementation of this one in terms of the
 179     // old function by copying the input to ensure that it's NUL-terminated and
 180     // then using MB2WC() to convert it
 181
 182     // the number of chars [which would be] written to dst [if it were not NULL]
 183     size_t dstWritten = 0;
 184
 185     // the number of NULs terminating this string
 186     size_t nulLen wxDUMMY_INITIALIZE(0);
 187
 188     // if we were not given the input size we just have to assume that the
 189     // string is properly terminated as we have no way of knowing how long it
 190     // is anyhow, but if we do have the size check whether there are enough
 191     // NULs at the end
 192     wxCharBuffer bufTmp;
 193     const char *srcEnd;
 194     if ( srcLen != wxNO_LEN )
 195     {
 196         // we need to know how to find the end of this string
 197         nulLen = GetMBNulLen();
 198         if ( nulLen == wxCONV_FAILED )
 199             return wxCONV_FAILED;
 200
 201         // if there are enough NULs we can avoid the copy
 202         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 203         {
 204             // make a copy in order to properly NUL-terminate the string
 205             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 206             char * const p = bufTmp.data();
 207             memcpy(p, src, srcLen);
 208             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 209                 *s = '\0';
 210
 211             src = bufTmp;
 212         }
 213
 214         srcEnd = src + srcLen;
 215     }
 216     else // quit after the first loop iteration
 217     {
 218         srcEnd = NULL;
 219     }
 220
 221     for ( ;; )
 222     {
 223         // try to convert the current chunk
 224         size_t lenChunk = MB2WC(NULL, src, 0);
 225         if ( lenChunk == wxCONV_FAILED )
 226             return wxCONV_FAILED;
 227
 228         lenChunk++; // for the L'\0' at the end of this chunk
 229
 230         dstWritten += lenChunk;
 231
 232         if ( lenChunk == 1 )
 233         {
 234             // nothing left in the input string, conversion succeeded
 235             break;
 236         }
 237
 238         if ( dst )
 239         {
 240             if ( dstWritten > dstLen )
 241                 return wxCONV_FAILED;
 242
 243             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 244                 return wxCONV_FAILED;
 245
 246             dst += lenChunk;
 247         }
 248
 249         if ( !srcEnd )
 250         {
 251             // we convert just one chunk in this case as this is the entire
 252             // string anyhow
 253             break;
 254         }
 255
 256         // advance the input pointer past the end of this chunk
 257         while ( NotAllNULs(src, nulLen) )
 258         {
 259             // notice that we must skip over multiple bytes here as we suppose
 260             // that if NUL takes 2 or 4 bytes, then all the other characters do
 261             // too and so if advanced by a single byte we might erroneously
 262             // detect sequences of NUL bytes in the middle of the input
 263             src += nulLen;
 264         }
 265
 266         src += nulLen; // skipping over its terminator as well
 267
 268         // note that ">=" (and not just "==") is needed here as the terminator
 269         // we skipped just above could be inside or just after the buffer
 270         // delimited by inEnd
 271         if ( src >= srcEnd )
 272             break;
 273     }
 274
 275     return dstWritten;
 276 }
 277
 278 size_t
 279 wxMBConv::FromWChar(char *dst, size_t dstLen,
 280                     const wchar_t *src, size_t srcLen) const
 281 {
 282     // the number of chars [which would be] written to dst [if it were not NULL]
 283     size_t dstWritten = 0;
 284
 285     // make a copy of the input string unless it is already properly
 286     // NUL-terminated
 287     //
 288     // if we don't know its length we have no choice but to assume that it is,
 289     // indeed, properly terminated
 290     wxWCharBuffer bufTmp;
 291     if ( srcLen == wxNO_LEN )
 292     {
 293         srcLen = wxWcslen(src) + 1;
 294     }
 295     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 296     {
 297         // make a copy in order to properly NUL-terminate the string
 298         bufTmp = wxWCharBuffer(srcLen);
 299         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 300         src = bufTmp;
 301     }
 302
 303     const size_t lenNul = GetMBNulLen();
 304     for ( const wchar_t * const srcEnd = src + srcLen;
 305           src < srcEnd;
 306           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 307     {
 308         // try to convert the current chunk
 309         size_t lenChunk = WC2MB(NULL, src, 0);
 310
 311         if ( lenChunk == wxCONV_FAILED )
 312             return wxCONV_FAILED;
 313
 314         lenChunk += lenNul;
 315         dstWritten += lenChunk;
 316
 317         if ( dst )
 318         {
 319             if ( dstWritten > dstLen )
 320                 return wxCONV_FAILED;
 321
 322             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 323                 return wxCONV_FAILED;
 324
 325             dst += lenChunk;
 326         }
 327     }
 328
 329     return dstWritten;
 330 }
 331
 332 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 333 {
 334     size_t rc = ToWChar(out, outLen, in);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         // ToWChar() returns the buffer length, i.e. including the trailing
 338         // NUL, while this method doesn't take it into account
 339         rc--;
 340     }
 341
 342     return rc;
 343 }
 344
 345 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 346 {
 347     size_t rc = FromWChar(out, outLen, in);
 348     if ( rc != wxCONV_FAILED )
 349     {
 350         rc -= GetMBNulLen();
 351     }
 352
 353     return rc;
 354 }
 355
 356 wxMBConv::~wxMBConv()
 357 {
 358     // nothing to do here (necessary for Darwin linking probably)
 359 }
 360
 361 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 362 {
 363     if ( psz )
 364     {
 365         // calculate the length of the buffer needed first
 366         const size_t nLen = MB2WC(NULL, psz, 0);
 367         if ( nLen != wxCONV_FAILED )
 368         {
 369             // now do the actual conversion
 370             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 371
 372             // +1 for the trailing NULL
 373             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxWCharBuffer();
 379 }
 380
 381 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 382 {
 383     if ( pwz )
 384     {
 385         const size_t nLen = WC2MB(NULL, pwz, 0);
 386         if ( nLen != wxCONV_FAILED )
 387         {
 388             // extra space for trailing NUL(s)
 389             static const size_t extraLen = GetMaxMBNulLen();
 390
 391             wxCharBuffer buf(nLen + extraLen - 1);
 392             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 393                 return buf;
 394         }
 395     }
 396
 397     return wxCharBuffer();
 398 }
 399
 400 const wxWCharBuffer
 401 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 402 {
 403     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 404     if ( dstLen != wxCONV_FAILED )
 405     {
 406         wxWCharBuffer wbuf(dstLen - 1);
 407         if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED )
 408         {
 409             if ( outLen )
 410             {
 411                 *outLen = dstLen;
 412                 if ( wbuf[dstLen - 1] == L'\0' )
 413                     (*outLen)--;
 414             }
 415
 416             return wbuf;
 417         }
 418     }
 419
 420     if ( outLen )
 421         *outLen = 0;
 422
 423     return wxWCharBuffer();
 424 }
 425
 426 const wxCharBuffer
 427 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 428 {
 429     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 430     if ( dstLen != wxCONV_FAILED )
 431     {
 432         wxCharBuffer buf(dstLen - 1);
 433         if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED )
 434         {
 435             if ( outLen )
 436             {
 437                 *outLen = dstLen;
 438
 439                 const size_t nulLen = GetMBNulLen();
 440                 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 441                 {
 442                     // in this case the output is NUL-terminated and we're not
 443                     // supposed to count NUL
 444                     (*outLen) -= nulLen;
 445                 }
 446             }
 447
 448             return buf;
 449         }
 450     }
 451
 452     if ( outLen )
 453         *outLen = 0;
 454
 455     return wxCharBuffer();
 456 }
 457
 458 // ----------------------------------------------------------------------------
 459 // wxMBConvLibc
 460 // ----------------------------------------------------------------------------
 461
 462 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 463 {
 464     return wxMB2WC(buf, psz, n);
 465 }
 466
 467 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 468 {
 469     return wxWC2MB(buf, psz, n);
 470 }
 471
 472 // ----------------------------------------------------------------------------
 473 // wxConvBrokenFileNames
 474 // ----------------------------------------------------------------------------
 475
 476 #ifdef __UNIX__
 477
 478 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 479 {
 480     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 481                   || wxStricmp(charset, _T("UTF8")) == 0  )
 482         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 483     else
 484         m_conv = new wxCSConv(charset);
 485 }
 486
 487 #endif // __UNIX__
 488
 489 // ----------------------------------------------------------------------------
 490 // UTF-7
 491 // ----------------------------------------------------------------------------
 492
 493 // Implementation (C) 2004 Fredrik Roubert
 494
 495 //
 496 // BASE64 decoding table
 497 //
 498 static const unsigned char utf7unb64[] =
 499 {
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 502     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 504     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 506     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 507     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 509     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 510     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 511     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 513     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 514     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 515     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 529     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 530     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 531     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 532 };
 533
 534 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 535 {
 536     size_t len = 0;
 537
 538     while ( *psz && (!buf || (len < n)) )
 539     {
 540         unsigned char cc = *psz++;
 541         if (cc != '+')
 542         {
 543             // plain ASCII char
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547         }
 548         else if (*psz == '-')
 549         {
 550             // encoded plus sign
 551             if (buf)
 552                 *buf++ = cc;
 553             len++;
 554             psz++;
 555         }
 556         else // start of BASE64 encoded string
 557         {
 558             bool lsb, ok;
 559             unsigned int d, l;
 560             for ( ok = lsb = false, d = 0, l = 0;
 561                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 562                   psz++ )
 563             {
 564                 d <<= 6;
 565                 d += cc;
 566                 for (l += 6; l >= 8; lsb = !lsb)
 567                 {
 568                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 569                     if (lsb)
 570                     {
 571                         if (buf)
 572                             *buf++ |= c;
 573                         len ++;
 574                     }
 575                     else
 576                     {
 577                         if (buf)
 578                             *buf = (wchar_t)(c << 8);
 579                     }
 580
 581                     ok = true;
 582                 }
 583             }
 584
 585             if ( !ok )
 586             {
 587                 // in valid UTF7 we should have valid characters after '+'
 588                 return wxCONV_FAILED;
 589             }
 590
 591             if (*psz == '-')
 592                 psz++;
 593         }
 594     }
 595
 596     if ( buf && (len < n) )
 597         *buf = '\0';
 598
 599     return len;
 600 }
 601
 602 //
 603 // BASE64 encoding table
 604 //
 605 static const unsigned char utf7enb64[] =
 606 {
 607     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 608     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 609     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 610     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 611     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 612     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 613     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 614     '4', '5', '6', '7', '8', '9', '+', '/'
 615 };
 616
 617 //
 618 // UTF-7 encoding table
 619 //
 620 // 0 - Set D (directly encoded characters)
 621 // 1 - Set O (optional direct characters)
 622 // 2 - whitespace characters (optional)
 623 // 3 - special characters
 624 //
 625 static const unsigned char utf7encode[128] =
 626 {
 627     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 628     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 629     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 630     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 631     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 632     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 633     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 634     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 635 };
 636
 637 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 638 {
 639     size_t len = 0;
 640
 641     while (*psz && ((!buf) || (len < n)))
 642     {
 643         wchar_t cc = *psz++;
 644         if (cc < 0x80 && utf7encode[cc] < 1)
 645         {
 646             // plain ASCII char
 647             if (buf)
 648                 *buf++ = (char)cc;
 649             len++;
 650         }
 651 #ifndef WC_UTF16
 652         else if (((wxUint32)cc) > 0xffff)
 653         {
 654             // no surrogate pair generation (yet?)
 655             return wxCONV_FAILED;
 656         }
 657 #endif
 658         else
 659         {
 660             if (buf)
 661                 *buf++ = '+';
 662             len++;
 663             if (cc != '+')
 664             {
 665                 // BASE64 encode string
 666                 unsigned int lsb, d, l;
 667                 for (d = 0, l = 0; /*nothing*/; psz++)
 668                 {
 669                     for (lsb = 0; lsb < 2; lsb ++)
 670                     {
 671                         d <<= 8;
 672                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 673
 674                         for (l += 8; l >= 6; )
 675                         {
 676                             l -= 6;
 677                             if (buf)
 678                                 *buf++ = utf7enb64[(d >> l) % 64];
 679                             len++;
 680                         }
 681                     }
 682                     cc = *psz;
 683                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 684                         break;
 685                 }
 686                 if (l != 0)
 687                 {
 688                     if (buf)
 689                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 690                     len++;
 691                 }
 692             }
 693             if (buf)
 694                 *buf++ = '-';
 695             len++;
 696         }
 697     }
 698     if (buf && (len < n))
 699         *buf = 0;
 700     return len;
 701 }
 702
 703 // ----------------------------------------------------------------------------
 704 // UTF-8
 705 // ----------------------------------------------------------------------------
 706
 707 static wxUint32 utf8_max[]=
 708     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 709
 710 // boundaries of the private use area we use to (temporarily) remap invalid
 711 // characters invalid in a UTF-8 encoded string
 712 const wxUint32 wxUnicodePUA = 0x100000;
 713 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 714
 715 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 716 {
 717     size_t len = 0;
 718
 719     while (*psz && ((!buf) || (len < n)))
 720     {
 721         const char *opsz = psz;
 722         bool invalid = false;
 723         unsigned char cc = *psz++, fc = cc;
 724         unsigned cnt;
 725         for (cnt = 0; fc & 0x80; cnt++)
 726             fc <<= 1;
 727         if (!cnt)
 728         {
 729             // plain ASCII char
 730             if (buf)
 731                 *buf++ = cc;
 732             len++;
 733
 734             // escape the escape character for octal escapes
 735             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 736                     && cc == '\\' && (!buf || len < n))
 737             {
 738                 if (buf)
 739                     *buf++ = cc;
 740                 len++;
 741             }
 742         }
 743         else
 744         {
 745             cnt--;
 746             if (!cnt)
 747             {
 748                 // invalid UTF-8 sequence
 749                 invalid = true;
 750             }
 751             else
 752             {
 753                 unsigned ocnt = cnt - 1;
 754                 wxUint32 res = cc & (0x3f >> cnt);
 755                 while (cnt--)
 756                 {
 757                     cc = *psz;
 758                     if ((cc & 0xC0) != 0x80)
 759                     {
 760                         // invalid UTF-8 sequence
 761                         invalid = true;
 762                         break;
 763                     }
 764                     psz++;
 765                     res = (res << 6) | (cc & 0x3f);
 766                 }
 767                 if (invalid || res <= utf8_max[ocnt])
 768                 {
 769                     // illegal UTF-8 encoding
 770                     invalid = true;
 771                 }
 772                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 773                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 774                 {
 775                     // if one of our PUA characters turns up externally
 776                     // it must also be treated as an illegal sequence
 777                     // (a bit like you have to escape an escape character)
 778                     invalid = true;
 779                 }
 780                 else
 781                 {
 782 #ifdef WC_UTF16
 783                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 784                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 785                     if (pa == wxCONV_FAILED)
 786                     {
 787                         invalid = true;
 788                     }
 789                     else
 790                     {
 791                         if (buf)
 792                             buf += pa;
 793                         len += pa;
 794                     }
 795 #else // !WC_UTF16
 796                     if (buf)
 797                         *buf++ = (wchar_t)res;
 798                     len++;
 799 #endif // WC_UTF16/!WC_UTF16
 800                 }
 801             }
 802             if (invalid)
 803             {
 804                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 805                 {
 806                     while (opsz < psz && (!buf || len < n))
 807                     {
 808 #ifdef WC_UTF16
 809                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 810                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 811                         wxASSERT(pa != wxCONV_FAILED);
 812                         if (buf)
 813                             buf += pa;
 814                         opsz++;
 815                         len += pa;
 816 #else
 817                         if (buf)
 818                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 819                         opsz++;
 820                         len++;
 821 #endif
 822                     }
 823                 }
 824                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 825                 {
 826                     while (opsz < psz && (!buf || len < n))
 827                     {
 828                         if ( buf && len + 3 < n )
 829                         {
 830                             unsigned char on = *opsz;
 831                             *buf++ = L'\\';
 832                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 833                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 834                             *buf++ = (wchar_t)( L'0' + on % 010 );
 835                         }
 836                         opsz++;
 837                         len += 4;
 838                     }
 839                 }
 840                 else // MAP_INVALID_UTF8_NOT
 841                 {
 842                     return wxCONV_FAILED;
 843                 }
 844             }
 845         }
 846     }
 847     if (buf && (len < n))
 848         *buf = 0;
 849     return len;
 850 }
 851
 852 static inline bool isoctal(wchar_t wch)
 853 {
 854     return L'0' <= wch && wch <= L'7';
 855 }
 856
 857 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 858 {
 859     size_t len = 0;
 860
 861     while (*psz && ((!buf) || (len < n)))
 862     {
 863         wxUint32 cc;
 864 #ifdef WC_UTF16
 865         // cast is ok for WC_UTF16
 866         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 867         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 868 #else
 869         cc=(*psz++) & 0x7fffffff;
 870 #endif
 871
 872         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 873                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 874         {
 875             if (buf)
 876                 *buf++ = (char)(cc - wxUnicodePUA);
 877             len++;
 878         }
 879         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 880                     && cc == L'\\' && psz[0] == L'\\' )
 881         {
 882             if (buf)
 883                 *buf++ = (char)cc;
 884             psz++;
 885             len++;
 886         }
 887         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 888                     cc == L'\\' &&
 889                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 890         {
 891             if (buf)
 892             {
 893                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 894                                  (psz[1] - L'0')*010 +
 895                                  (psz[2] - L'0'));
 896             }
 897
 898             psz += 3;
 899             len++;
 900         }
 901         else
 902         {
 903             unsigned cnt;
 904             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 905             if (!cnt)
 906             {
 907                 // plain ASCII char
 908                 if (buf)
 909                     *buf++ = (char) cc;
 910                 len++;
 911             }
 912
 913             else
 914             {
 915                 len += cnt + 1;
 916                 if (buf)
 917                 {
 918                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 919                     while (cnt--)
 920                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 921                 }
 922             }
 923         }
 924     }
 925
 926     if (buf && (len<n))
 927         *buf = 0;
 928
 929     return len;
 930 }
 931
 932 // ============================================================================
 933 // UTF-16
 934 // ============================================================================
 935
 936 #ifdef WORDS_BIGENDIAN
 937     #define wxMBConvUTF16straight wxMBConvUTF16BE
 938     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 939 #else
 940     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 941     #define wxMBConvUTF16straight wxMBConvUTF16LE
 942 #endif
 943
 944 /* static */
 945 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 946 {
 947     if ( srcLen == wxNO_LEN )
 948     {
 949         // count the number of bytes in input, including the trailing NULs
 950         const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
 951         for ( srcLen = 1; *in++; srcLen++ )
 952             ;
 953
 954         srcLen *= BYTES_PER_CHAR;
 955     }
 956     else // we already have the length
 957     {
 958         // we can only convert an entire number of UTF-16 characters
 959         if ( srcLen % BYTES_PER_CHAR )
 960             return wxCONV_FAILED;
 961     }
 962
 963     return srcLen;
 964 }
 965
 966 // case when in-memory representation is UTF-16 too
 967 #ifdef WC_UTF16
 968
 969 // ----------------------------------------------------------------------------
 970 // conversions without endianness change
 971 // ----------------------------------------------------------------------------
 972
 973 size_t
 974 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 975                                const char *src, size_t srcLen) const
 976 {
 977     // set up the scene for using memcpy() (which is presumably more efficient
 978     // than copying the bytes one by one)
 979     srcLen = GetLength(src, srcLen);
 980     if ( srcLen == wxNO_LEN )
 981         return wxCONV_FAILED;
 982
 983     const size_t inLen = srcLen/BYTES_PER_CHAR;
 984     if ( dst )
 985     {
 986         if ( dstLen < inLen )
 987             return wxCONV_FAILED;
 988
 989         memcpy(dst, src, srcLen);
 990     }
 991
 992     return inLen;
 993 }
 994
 995 size_t
 996 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
 997                                  const wchar_t *src, size_t srcLen) const
 998 {
 999     if ( srcLen == wxNO_LEN )
1000         srcLen = wxWcslen(src) + 1;
1001
1002     srcLen *= BYTES_PER_CHAR;
1003
1004     if ( dst )
1005     {
1006         if ( dstLen < srcLen )
1007             return wxCONV_FAILED;
1008
1009         memcpy(dst, src, srcLen);
1010     }
1011
1012     return srcLen;
1013 }
1014
1015 // ----------------------------------------------------------------------------
1016 // endian-reversing conversions
1017 // ----------------------------------------------------------------------------
1018
1019 size_t
1020 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1021                            const char *src, size_t srcLen) const
1022 {
1023     srcLen = GetLength(src, srcLen);
1024     if ( srcLen == wxNO_LEN )
1025         return wxCONV_FAILED;
1026
1027     srcLen /= BYTES_PER_CHAR;
1028
1029     if ( dst )
1030     {
1031         if ( dstLen < srcLen )
1032             return wxCONV_FAILED;
1033
1034         const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1035         for ( size_t n = 0; n < srcLen; n++, in++ )
1036         {
1037             *dst++ = wxUINT16_SWAP_ALWAYS(*in);
1038         }
1039     }
1040
1041     return srcLen;
1042 }
1043
1044 size_t
1045 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1046                              const wchar_t *src, size_t srcLen) const
1047 {
1048     if ( srcLen == wxNO_LEN )
1049         srcLen = wxWcslen(src) + 1;
1050
1051     srcLen *= BYTES_PER_CHAR;
1052
1053     if ( dst )
1054     {
1055         if ( dstLen < srcLen )
1056             return wxCONV_FAILED;
1057
1058         wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1059         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1060         {
1061             *out++ = wxUINT16_SWAP_ALWAYS(*src);
1062         }
1063     }
1064
1065     return srcLen;
1066 }
1067
1068 #else // !WC_UTF16: wchar_t is UTF-32
1069
1070 // ----------------------------------------------------------------------------
1071 // conversions without endianness change
1072 // ----------------------------------------------------------------------------
1073
1074 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1075 {
1076     size_t len=0;
1077
1078     while (*(wxUint16*)psz && (!buf || len < n))
1079     {
1080         wxUint32 cc;
1081         size_t pa=decode_utf16((wxUint16*)psz, cc);
1082         if (pa == wxCONV_FAILED)
1083             return pa;
1084
1085         if (buf)
1086             *buf++ = (wchar_t)cc;
1087         len++;
1088         psz += pa * sizeof(wxUint16);
1089     }
1090     if (buf && len<n)   *buf=0;
1091
1092     return len;
1093 }
1094
1095
1096 // copy 32bit String to 16bit MB
1097 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1098 {
1099     size_t len=0;
1100
1101     while (*psz && (!buf || len < n))
1102     {
1103         wxUint16 cc[2];
1104         size_t pa=encode_utf16(*psz, cc);
1105
1106         if (pa == wxCONV_FAILED)
1107             return pa;
1108
1109         if (buf)
1110         {
1111             *(wxUint16*)buf = cc[0];
1112             buf += sizeof(wxUint16);
1113             if (pa > 1)
1114             {
1115                 *(wxUint16*)buf = cc[1];
1116                 buf += sizeof(wxUint16);
1117             }
1118         }
1119
1120         len += pa*sizeof(wxUint16);
1121         psz++;
1122     }
1123     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1124
1125     return len;
1126 }
1127
1128 // ----------------------------------------------------------------------------
1129 // endian-reversing conversions
1130 // ----------------------------------------------------------------------------
1131
1132 // swap 16bit MB to 32bit String
1133 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1134 {
1135     size_t len=0;
1136
1137     while (*(wxUint16*)psz && (!buf || len < n))
1138     {
1139         wxUint32 cc;
1140         char tmp[4];
1141         tmp[0]=psz[1];  tmp[1]=psz[0];
1142         tmp[2]=psz[3];  tmp[3]=psz[2];
1143
1144         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1145         if (pa == wxCONV_FAILED)
1146             return pa;
1147
1148         if (buf)
1149             *buf++ = (wchar_t)cc;
1150
1151         len++;
1152         psz += pa * sizeof(wxUint16);
1153     }
1154     if (buf && len<n)   *buf=0;
1155
1156     return len;
1157 }
1158
1159
1160 // swap 32bit String to 16bit MB
1161 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1162 {
1163     size_t len=0;
1164
1165     while (*psz && (!buf || len < n))
1166     {
1167         wxUint16 cc[2];
1168         size_t pa=encode_utf16(*psz, cc);
1169
1170         if (pa == wxCONV_FAILED)
1171             return pa;
1172
1173         if (buf)
1174         {
1175             *buf++ = ((char*)cc)[1];
1176             *buf++ = ((char*)cc)[0];
1177             if (pa > 1)
1178             {
1179                 *buf++ = ((char*)cc)[3];
1180                 *buf++ = ((char*)cc)[2];
1181             }
1182         }
1183
1184         len += pa*sizeof(wxUint16);
1185         psz++;
1186     }
1187     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1188
1189     return len;
1190 }
1191
1192 #endif // WC_UTF16/!WC_UTF16
1193
1194
1195 // ----------------------------------------------------------------------------
1196 // UTF-32
1197 // ----------------------------------------------------------------------------
1198
1199 #ifdef WORDS_BIGENDIAN
1200     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1201     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1202 #else
1203     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1204     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1205 #endif
1206
1207
1208 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1209 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1210
1211 /* static */
1212 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1213 {
1214     if ( srcLen == wxNO_LEN )
1215     {
1216         // count the number of bytes in input, including the trailing NULs
1217         const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1218         for ( srcLen = 1; *in++; srcLen++ )
1219             ;
1220
1221         srcLen *= BYTES_PER_CHAR;
1222     }
1223     else // we already have the length
1224     {
1225         // we can only convert an entire number of UTF-32 characters
1226         if ( srcLen % BYTES_PER_CHAR )
1227             return wxCONV_FAILED;
1228     }
1229
1230     return srcLen;
1231 }
1232
1233 // case when in-memory representation is UTF-16
1234 #ifdef WC_UTF16
1235
1236 // ----------------------------------------------------------------------------
1237 // conversions without endianness change
1238 // ----------------------------------------------------------------------------
1239
1240 size_t
1241 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1242                                const char *src, size_t srcLen) const
1243 {
1244     srcLen = GetLength(src, srcLen);
1245     if ( srcLen == wxNO_LEN )
1246         return wxCONV_FAILED;
1247
1248     const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1249     const size_t inLen = srcLen/BYTES_PER_CHAR;
1250     size_t outLen = 0;
1251     for ( size_t n = 0; n < inLen; n++ )
1252     {
1253         wxUint16 cc[2];
1254         const size_t numChars = encode_utf16(*in++, cc);
1255         if ( numChars == wxCONV_FAILED )
1256             return wxCONV_FAILED;
1257
1258         outLen += numChars;
1259         if ( dst )
1260         {
1261             if ( outLen > dstLen )
1262                 return wxCONV_FAILED;
1263
1264             *dst++ = cc[0];
1265             if ( numChars == 2 )
1266             {
1267                 // second character of a surrogate
1268                 *dst++ = cc[1];
1269             }
1270         }
1271     }
1272
1273     return outLen;
1274 }
1275
1276 size_t
1277 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1278                                  const wchar_t *src, size_t srcLen) const
1279 {
1280     if ( srcLen == wxNO_LEN )
1281         srcLen = wxWcslen(src) + 1;
1282
1283     if ( !dst )
1284     {
1285         // optimization: return maximal space which could be needed for this
1286         // string instead of the exact amount which could be less if there are
1287         // any surrogates in the input
1288         //
1289         // we consider that surrogates are rare enough to make it worthwhile to
1290         // avoid running the loop below at the cost of slightly extra memory
1291         // consumption
1292         return srcLen*BYTES_PER_CHAR;
1293     }
1294
1295     wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1296     size_t outLen = 0;
1297     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1298     {
1299         const wxUint32 ch = wxDecodeSurrogate(&src);
1300         if ( !src )
1301             return wxCONV_FAILED;
1302
1303         outLen += BYTES_PER_CHAR;
1304
1305         if ( outLen > dstLen )
1306             return wxCONV_FAILED;
1307
1308         *out++ = ch;
1309     }
1310
1311     return outLen;
1312 }
1313
1314 // ----------------------------------------------------------------------------
1315 // endian-reversing conversions
1316 // ----------------------------------------------------------------------------
1317
1318 size_t
1319 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1320                            const char *src, size_t srcLen) const
1321 {
1322     srcLen = GetLength(src, srcLen);
1323     if ( srcLen == wxNO_LEN )
1324         return wxCONV_FAILED;
1325
1326     const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1327     const size_t inLen = srcLen/BYTES_PER_CHAR;
1328     size_t outLen = 0;
1329     for ( size_t n = 0; n < inLen; n++, in++ )
1330     {
1331         wxUint16 cc[2];
1332         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), cc);
1333         if ( numChars == wxCONV_FAILED )
1334             return wxCONV_FAILED;
1335
1336         outLen += numChars;
1337         if ( dst )
1338         {
1339             if ( outLen > dstLen )
1340                 return wxCONV_FAILED;
1341
1342             *dst++ = cc[0];
1343             if ( numChars == 2 )
1344             {
1345                 // second character of a surrogate
1346                 *dst++ = cc[1];
1347             }
1348         }
1349     }
1350
1351     return outLen;
1352 }
1353
1354 size_t
1355 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1356                              const wchar_t *src, size_t srcLen) const
1357 {
1358     if ( srcLen == wxNO_LEN )
1359         srcLen = wxWcslen(src) + 1;
1360
1361     if ( !dst )
1362     {
1363         // optimization: return maximal space which could be needed for this
1364         // string instead of the exact amount which could be less if there are
1365         // any surrogates in the input
1366         //
1367         // we consider that surrogates are rare enough to make it worthwhile to
1368         // avoid running the loop below at the cost of slightly extra memory
1369         // consumption
1370         return srcLen*BYTES_PER_CHAR;
1371     }
1372
1373     wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1374     size_t outLen = 0;
1375     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1376     {
1377         const wxUint32 ch = wxDecodeSurrogate(&src);
1378         if ( !src )
1379             return wxCONV_FAILED;
1380
1381         outLen += BYTES_PER_CHAR;
1382
1383         if ( outLen > dstLen )
1384             return wxCONV_FAILED;
1385
1386         *out++ = wxUINT32_SWAP_ALWAYS(ch);
1387     }
1388
1389     return outLen;
1390 }
1391
1392 #else // !WC_UTF16: wchar_t is UTF-32
1393
1394 // copy 32bit MB to 32bit String
1395 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1396 {
1397     size_t len=0;
1398
1399     while (*(wxUint32*)psz && (!buf || len < n))
1400     {
1401         if (buf)
1402             *buf++ = (wchar_t)(*(wxUint32*)psz);
1403         len++;
1404         psz += sizeof(wxUint32);
1405     }
1406
1407     if (buf && len<n)
1408         *buf=0;
1409
1410     return len;
1411 }
1412
1413
1414 // copy 32bit String to 32bit MB
1415 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1416 {
1417     size_t len=0;
1418
1419     while (*psz && (!buf || len < n))
1420     {
1421         if (buf)
1422         {
1423             *(wxUint32*)buf = *psz;
1424             buf += sizeof(wxUint32);
1425         }
1426
1427         len += sizeof(wxUint32);
1428         psz++;
1429     }
1430
1431     if (buf && len<=n-sizeof(wxUint32))
1432         *(wxUint32*)buf=0;
1433
1434     return len;
1435 }
1436
1437
1438 // swap 32bit MB to 32bit String
1439 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1440 {
1441     size_t len=0;
1442
1443     while (*(wxUint32*)psz && (!buf || len < n))
1444     {
1445         if (buf)
1446         {
1447             ((char *)buf)[0] = psz[3];
1448             ((char *)buf)[1] = psz[2];
1449             ((char *)buf)[2] = psz[1];
1450             ((char *)buf)[3] = psz[0];
1451             buf++;
1452         }
1453         len++;
1454         psz += sizeof(wxUint32);
1455     }
1456
1457     if (buf && len<n)
1458         *buf=0;
1459
1460     return len;
1461 }
1462
1463
1464 // swap 32bit String to 32bit MB
1465 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1466 {
1467     size_t len=0;
1468
1469     while (*psz && (!buf || len < n))
1470     {
1471         if (buf)
1472         {
1473             *buf++ = ((char *)psz)[3];
1474             *buf++ = ((char *)psz)[2];
1475             *buf++ = ((char *)psz)[1];
1476             *buf++ = ((char *)psz)[0];
1477         }
1478         len += sizeof(wxUint32);
1479         psz++;
1480     }
1481
1482     if (buf && len<=n-sizeof(wxUint32))
1483         *(wxUint32*)buf=0;
1484
1485     return len;
1486 }
1487
1488
1489 #endif // WC_UTF16/!WC_UTF16
1490
1491
1492 // ============================================================================
1493 // The classes doing conversion using the iconv_xxx() functions
1494 // ============================================================================
1495
1496 #ifdef HAVE_ICONV
1497
1498 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1499 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1500 //     (unless there's yet another bug in glibc) the only case when iconv()
1501 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1502 //     left in the input buffer -- when _real_ error occurs,
1503 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1504 //     iconv() failure.
1505 //     [This bug does not appear in glibc 2.2.]
1506 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1507 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1508                                      (errno != E2BIG || bufLeft != 0))
1509 #else
1510 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1511 #endif
1512
1513 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1514
1515 #define ICONV_T_INVALID ((iconv_t)-1)
1516
1517 #if SIZEOF_WCHAR_T == 4
1518     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1519     #define WC_ENC      wxFONTENCODING_UTF32
1520 #elif SIZEOF_WCHAR_T == 2
1521     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1522     #define WC_ENC      wxFONTENCODING_UTF16
1523 #else // sizeof(wchar_t) != 2 nor 4
1524     // does this ever happen?
1525     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1526 #endif
1527
1528 // ----------------------------------------------------------------------------
1529 // wxMBConv_iconv: encapsulates an iconv character set
1530 // ----------------------------------------------------------------------------
1531
1532 class wxMBConv_iconv : public wxMBConv
1533 {
1534 public:
1535     wxMBConv_iconv(const wxChar *name);
1536     virtual ~wxMBConv_iconv();
1537
1538     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1539     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1540
1541     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1542     virtual size_t GetMBNulLen() const;
1543
1544     virtual wxMBConv *Clone() const
1545     {
1546         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1547         p->m_minMBCharWidth = m_minMBCharWidth;
1548         return p;
1549     }
1550
1551     bool IsOk() const
1552         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1553
1554 protected:
1555     // the iconv handlers used to translate from multibyte to wide char and in
1556     // the other direction
1557     iconv_t m2w,
1558             w2m;
1559 #if wxUSE_THREADS
1560     // guards access to m2w and w2m objects
1561     wxMutex m_iconvMutex;
1562 #endif
1563
1564 private:
1565     // the name (for iconv_open()) of a wide char charset -- if none is
1566     // available on this machine, it will remain NULL
1567     static wxString ms_wcCharsetName;
1568
1569     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1570     // different endian-ness than the native one
1571     static bool ms_wcNeedsSwap;
1572
1573
1574     // name of the encoding handled by this conversion
1575     wxString m_name;
1576
1577     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1578     // initially
1579     size_t m_minMBCharWidth;
1580 };
1581
1582 // make the constructor available for unit testing
1583 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1584 {
1585     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1586     if ( !result->IsOk() )
1587     {
1588         delete result;
1589         return 0;
1590     }
1591     return result;
1592 }
1593
1594 wxString wxMBConv_iconv::ms_wcCharsetName;
1595 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1596
1597 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1598               : m_name(name)
1599 {
1600     m_minMBCharWidth = 0;
1601
1602     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1603     // names for the charsets
1604     const wxCharBuffer cname(wxString(name).ToAscii());
1605
1606     // check for charset that represents wchar_t:
1607     if ( ms_wcCharsetName.empty() )
1608     {
1609         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1610
1611 #if wxUSE_FONTMAP
1612         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1613 #else // !wxUSE_FONTMAP
1614         static const wxChar *names[] =
1615         {
1616 #if SIZEOF_WCHAR_T == 4
1617             _T("UCS-4"),
1618 #elif SIZEOF_WCHAR_T = 2
1619             _T("UCS-2"),
1620 #endif
1621             NULL
1622         };
1623 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1624
1625         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1626         {
1627             const wxString nameCS(*names);
1628
1629             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1630             wxString nameXE(nameCS);
1631             #ifdef WORDS_BIGENDIAN
1632                 nameXE += _T("BE");
1633             #else // little endian
1634                 nameXE += _T("LE");
1635             #endif
1636
1637             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1638                        nameXE.c_str());
1639
1640             m2w = iconv_open(nameXE.ToAscii(), cname);
1641             if ( m2w == ICONV_T_INVALID )
1642             {
1643                 // try charset w/o bytesex info (e.g. "UCS4")
1644                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1645                            nameCS.c_str());
1646                 m2w = iconv_open(nameCS.ToAscii(), cname);
1647
1648                 // and check for bytesex ourselves:
1649                 if ( m2w != ICONV_T_INVALID )
1650                 {
1651                     char    buf[2], *bufPtr;
1652                     wchar_t wbuf[2], *wbufPtr;
1653                     size_t  insz, outsz;
1654                     size_t  res;
1655
1656                     buf[0] = 'A';
1657                     buf[1] = 0;
1658                     wbuf[0] = 0;
1659                     insz = 2;
1660                     outsz = SIZEOF_WCHAR_T * 2;
1661                     wbufPtr = wbuf;
1662                     bufPtr = buf;
1663
1664                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1665                                 (char**)&wbufPtr, &outsz);
1666
1667                     if (ICONV_FAILED(res, insz))
1668                     {
1669                         wxLogLastError(wxT("iconv"));
1670                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1671                                    nameCS.c_str());
1672                     }
1673                     else // ok, can convert to this encoding, remember it
1674                     {
1675                         ms_wcCharsetName = nameCS;
1676                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1677                     }
1678                 }
1679             }
1680             else // use charset not requiring byte swapping
1681             {
1682                 ms_wcCharsetName = nameXE;
1683             }
1684         }
1685
1686         wxLogTrace(TRACE_STRCONV,
1687                    wxT("iconv wchar_t charset is \"%s\"%s"),
1688                    ms_wcCharsetName.empty() ? _T("<none>")
1689                                             : ms_wcCharsetName.c_str(),
1690                    ms_wcNeedsSwap ? _T(" (needs swap)")
1691                                   : _T(""));
1692     }
1693     else // we already have ms_wcCharsetName
1694     {
1695         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1696     }
1697
1698     if ( ms_wcCharsetName.empty() )
1699     {
1700         w2m = ICONV_T_INVALID;
1701     }
1702     else
1703     {
1704         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1705         if ( w2m == ICONV_T_INVALID )
1706         {
1707             wxLogTrace(TRACE_STRCONV,
1708                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1709                        ms_wcCharsetName.c_str(), cname.data());
1710         }
1711     }
1712 }
1713
1714 wxMBConv_iconv::~wxMBConv_iconv()
1715 {
1716     if ( m2w != ICONV_T_INVALID )
1717         iconv_close(m2w);
1718     if ( w2m != ICONV_T_INVALID )
1719         iconv_close(w2m);
1720 }
1721
1722 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1723 {
1724     // find the string length: notice that must be done differently for
1725     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1726     size_t inbuf;
1727     const size_t nulLen = GetMBNulLen();
1728     switch ( nulLen )
1729     {
1730         default:
1731             return wxCONV_FAILED;
1732
1733         case 1:
1734             inbuf = strlen(psz); // arguably more optimized than our version
1735             break;
1736
1737         case 2:
1738         case 4:
1739             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1740             // they also have to start at character boundary and not span two
1741             // adjacent characters
1742             const char *p;
1743             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1744                 ;
1745             inbuf = p - psz;
1746             break;
1747     }
1748
1749 #if wxUSE_THREADS
1750     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1751     //     Unfortunately there is a couple of global wxCSConv objects such as
1752     //     wxConvLocal that are used all over wx code, so we have to make sure
1753     //     the handle is used by at most one thread at the time. Otherwise
1754     //     only a few wx classes would be safe to use from non-main threads
1755     //     as MB<->WC conversion would fail "randomly".
1756     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1757 #endif // wxUSE_THREADS
1758
1759
1760     size_t outbuf = n * SIZEOF_WCHAR_T;
1761     size_t res, cres;
1762     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1763     wchar_t *bufPtr = buf;
1764     const char *pszPtr = psz;
1765
1766     if (buf)
1767     {
1768         // have destination buffer, convert there
1769         cres = iconv(m2w,
1770                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1771                      (char**)&bufPtr, &outbuf);
1772         res = n - (outbuf / SIZEOF_WCHAR_T);
1773
1774         if (ms_wcNeedsSwap)
1775         {
1776             // convert to native endianness
1777             for ( unsigned i = 0; i < res; i++ )
1778                 buf[n] = WC_BSWAP(buf[i]);
1779         }
1780
1781         // NUL-terminate the string if there is any space left
1782         if (res < n)
1783             buf[res] = 0;
1784     }
1785     else
1786     {
1787         // no destination buffer... convert using temp buffer
1788         // to calculate destination buffer requirement
1789         wchar_t tbuf[8];
1790         res = 0;
1791         do {
1792             bufPtr = tbuf;
1793             outbuf = 8*SIZEOF_WCHAR_T;
1794
1795             cres = iconv(m2w,
1796                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1797                          (char**)&bufPtr, &outbuf );
1798
1799             res += 8-(outbuf/SIZEOF_WCHAR_T);
1800         } while ((cres==(size_t)-1) && (errno==E2BIG));
1801     }
1802
1803     if (ICONV_FAILED(cres, inbuf))
1804     {
1805         //VS: it is ok if iconv fails, hence trace only
1806         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1807         return wxCONV_FAILED;
1808     }
1809
1810     return res;
1811 }
1812
1813 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1814 {
1815 #if wxUSE_THREADS
1816     // NB: explained in MB2WC
1817     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1818 #endif
1819
1820     size_t inlen = wxWcslen(psz);
1821     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1822     size_t outbuf = n;
1823     size_t res, cres;
1824
1825     wchar_t *tmpbuf = 0;
1826
1827     if (ms_wcNeedsSwap)
1828     {
1829         // need to copy to temp buffer to switch endianness
1830         // (doing WC_BSWAP twice on the original buffer won't help, as it
1831         //  could be in read-only memory, or be accessed in some other thread)
1832         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1833         for ( size_t i = 0; i < inlen; i++ )
1834             tmpbuf[n] = WC_BSWAP(psz[i]);
1835         tmpbuf[inlen] = L'\0';
1836         psz = tmpbuf;
1837     }
1838
1839     if (buf)
1840     {
1841         // have destination buffer, convert there
1842         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1843
1844         res = n-outbuf;
1845
1846         // NB: iconv was given only wcslen(psz) characters on input, and so
1847         //     it couldn't convert the trailing zero. Let's do it ourselves
1848         //     if there's some room left for it in the output buffer.
1849         if (res < n)
1850             buf[0] = 0;
1851     }
1852     else
1853     {
1854         // no destination buffer... convert using temp buffer
1855         // to calculate destination buffer requirement
1856         char tbuf[16];
1857         res = 0;
1858         do {
1859             buf = tbuf; outbuf = 16;
1860
1861             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1862
1863             res += 16 - outbuf;
1864         } while ((cres==(size_t)-1) && (errno==E2BIG));
1865     }
1866
1867     if (ms_wcNeedsSwap)
1868     {
1869         free(tmpbuf);
1870     }
1871
1872     if (ICONV_FAILED(cres, inbuf))
1873     {
1874         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1875         return wxCONV_FAILED;
1876     }
1877
1878     return res;
1879 }
1880
1881 size_t wxMBConv_iconv::GetMBNulLen() const
1882 {
1883     if ( m_minMBCharWidth == 0 )
1884     {
1885         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1886
1887 #if wxUSE_THREADS
1888         // NB: explained in MB2WC
1889         wxMutexLocker lock(self->m_iconvMutex);
1890 #endif
1891
1892         wchar_t *wnul = L"";
1893         char buf[8]; // should be enough for NUL in any encoding
1894         size_t inLen = sizeof(wchar_t),
1895                outLen = WXSIZEOF(buf);
1896         char *in = (char *)wnul;
1897         char *out = buf;
1898         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1899         {
1900             self->m_minMBCharWidth = (size_t)-1;
1901         }
1902         else // ok
1903         {
1904             self->m_minMBCharWidth = out - buf;
1905         }
1906     }
1907
1908     return m_minMBCharWidth;
1909 }
1910
1911 #endif // HAVE_ICONV
1912
1913
1914 // ============================================================================
1915 // Win32 conversion classes
1916 // ============================================================================
1917
1918 #ifdef wxHAVE_WIN32_MB2WC
1919
1920 // from utils.cpp
1921 #if wxUSE_FONTMAP
1922 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1923 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1924 #endif
1925
1926 class wxMBConv_win32 : public wxMBConv
1927 {
1928 public:
1929     wxMBConv_win32()
1930     {
1931         m_CodePage = CP_ACP;
1932         m_minMBCharWidth = 0;
1933     }
1934
1935     wxMBConv_win32(const wxMBConv_win32& conv)
1936     {
1937         m_CodePage = conv.m_CodePage;
1938         m_minMBCharWidth = conv.m_minMBCharWidth;
1939     }
1940
1941 #if wxUSE_FONTMAP
1942     wxMBConv_win32(const wxChar* name)
1943     {
1944         m_CodePage = wxCharsetToCodepage(name);
1945         m_minMBCharWidth = 0;
1946     }
1947
1948     wxMBConv_win32(wxFontEncoding encoding)
1949     {
1950         m_CodePage = wxEncodingToCodepage(encoding);
1951         m_minMBCharWidth = 0;
1952     }
1953 #endif // wxUSE_FONTMAP
1954
1955     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1956     {
1957         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1958         // the behaviour is not compatible with the Unix version (using iconv)
1959         // and break the library itself, e.g. wxTextInputStream::NextChar()
1960         // wouldn't work if reading an incomplete MB char didn't result in an
1961         // error
1962         //
1963         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1964         // Win XP or newer and it is not supported for UTF-[78] so we always
1965         // use our own conversions in this case. See
1966         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1967         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1968         if ( m_CodePage == CP_UTF8 )
1969         {
1970             return wxConvUTF8.MB2WC(buf, psz, n);
1971         }
1972
1973         if ( m_CodePage == CP_UTF7 )
1974         {
1975             return wxConvUTF7.MB2WC(buf, psz, n);
1976         }
1977
1978         int flags = 0;
1979         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1980                 IsAtLeastWin2kSP4() )
1981         {
1982             flags = MB_ERR_INVALID_CHARS;
1983         }
1984
1985         const size_t len = ::MultiByteToWideChar
1986                              (
1987                                 m_CodePage,     // code page
1988                                 flags,          // flags: fall on error
1989                                 psz,            // input string
1990                                 -1,             // its length (NUL-terminated)
1991                                 buf,            // output string
1992                                 buf ? n : 0     // size of output buffer
1993                              );
1994         if ( !len )
1995         {
1996             // function totally failed
1997             return wxCONV_FAILED;
1998         }
1999
2000         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2001         // check if we succeeded, by doing a double trip:
2002         if ( !flags && buf )
2003         {
2004             const size_t mbLen = strlen(psz);
2005             wxCharBuffer mbBuf(mbLen);
2006             if ( ::WideCharToMultiByte
2007                    (
2008                       m_CodePage,
2009                       0,
2010                       buf,
2011                       -1,
2012                       mbBuf.data(),
2013                       mbLen + 1,        // size in bytes, not length
2014                       NULL,
2015                       NULL
2016                    ) == 0 ||
2017                   strcmp(mbBuf, psz) != 0 )
2018             {
2019                 // we didn't obtain the same thing we started from, hence
2020                 // the conversion was lossy and we consider that it failed
2021                 return wxCONV_FAILED;
2022             }
2023         }
2024
2025         // note that it returns count of written chars for buf != NULL and size
2026         // of the needed buffer for buf == NULL so in either case the length of
2027         // the string (which never includes the terminating NUL) is one less
2028         return len - 1;
2029     }
2030
2031     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2032     {
2033         /*
2034             we have a problem here: by default, WideCharToMultiByte() may
2035             replace characters unrepresentable in the target code page with bad
2036             quality approximations such as turning "1/2" symbol (U+00BD) into
2037             "1" for the code pages which don't have it and we, obviously, want
2038             to avoid this at any price
2039
2040             the trouble is that this function does it _silently_, i.e. it won't
2041             even tell us whether it did or not... Win98/2000 and higher provide
2042             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2043             we have to resort to a round trip, i.e. check that converting back
2044             results in the same string -- this is, of course, expensive but
2045             otherwise we simply can't be sure to not garble the data.
2046          */
2047
2048         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2049         // it doesn't work with CJK encodings (which we test for rather roughly
2050         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2051         // supporting it
2052         BOOL usedDef wxDUMMY_INITIALIZE(false);
2053         BOOL *pUsedDef;
2054         int flags;
2055         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2056         {
2057             // it's our lucky day
2058             flags = WC_NO_BEST_FIT_CHARS;
2059             pUsedDef = &usedDef;
2060         }
2061         else // old system or unsupported encoding
2062         {
2063             flags = 0;
2064             pUsedDef = NULL;
2065         }
2066
2067         const size_t len = ::WideCharToMultiByte
2068                              (
2069                                 m_CodePage,     // code page
2070                                 flags,          // either none or no best fit
2071                                 pwz,            // input string
2072                                 -1,             // it is (wide) NUL-terminated
2073                                 buf,            // output buffer
2074                                 buf ? n : 0,    // and its size
2075                                 NULL,           // default "replacement" char
2076                                 pUsedDef        // [out] was it used?
2077                              );
2078
2079         if ( !len )
2080         {
2081             // function totally failed
2082             return wxCONV_FAILED;
2083         }
2084
2085         // if we were really converting, check if we succeeded
2086         if ( buf )
2087         {
2088             if ( flags )
2089             {
2090                 // check if the conversion failed, i.e. if any replacements
2091                 // were done
2092                 if ( usedDef )
2093                     return wxCONV_FAILED;
2094             }
2095             else // we must resort to double tripping...
2096             {
2097                 wxWCharBuffer wcBuf(n);
2098                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2099                         wcscmp(wcBuf, pwz) != 0 )
2100                 {
2101                     // we didn't obtain the same thing we started from, hence
2102                     // the conversion was lossy and we consider that it failed
2103                     return wxCONV_FAILED;
2104                 }
2105             }
2106         }
2107
2108         // see the comment above for the reason of "len - 1"
2109         return len - 1;
2110     }
2111
2112     virtual size_t GetMBNulLen() const
2113     {
2114         if ( m_minMBCharWidth == 0 )
2115         {
2116             int len = ::WideCharToMultiByte
2117                         (
2118                             m_CodePage,     // code page
2119                             0,              // no flags
2120                             L"",            // input string
2121                             1,              // translate just the NUL
2122                             NULL,           // output buffer
2123                             0,              // and its size
2124                             NULL,           // no replacement char
2125                             NULL            // [out] don't care if it was used
2126                         );
2127
2128             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2129             switch ( len )
2130             {
2131                 default:
2132                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2133                     // fall through
2134
2135                 case 0:
2136                     self->m_minMBCharWidth = (size_t)-1;
2137                     break;
2138
2139                 case 1:
2140                 case 2:
2141                 case 4:
2142                     self->m_minMBCharWidth = len;
2143                     break;
2144             }
2145         }
2146
2147         return m_minMBCharWidth;
2148     }
2149
2150     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2151
2152     bool IsOk() const { return m_CodePage != -1; }
2153
2154 private:
2155     static bool CanUseNoBestFit()
2156     {
2157         static int s_isWin98Or2k = -1;
2158
2159         if ( s_isWin98Or2k == -1 )
2160         {
2161             int verMaj, verMin;
2162             switch ( wxGetOsVersion(&verMaj, &verMin) )
2163             {
2164                 case wxWIN95:
2165                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2166                     break;
2167
2168                 case wxWINDOWS_NT:
2169                     s_isWin98Or2k = verMaj >= 5;
2170                     break;
2171
2172                 default:
2173                     // unknown, be conseravtive by default
2174                     s_isWin98Or2k = 0;
2175             }
2176
2177             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2178         }
2179
2180         return s_isWin98Or2k == 1;
2181     }
2182
2183     static bool IsAtLeastWin2kSP4()
2184     {
2185 #ifdef __WXWINCE__
2186         return false;
2187 #else
2188         static int s_isAtLeastWin2kSP4 = -1;
2189
2190         if ( s_isAtLeastWin2kSP4 == -1 )
2191         {
2192             OSVERSIONINFOEX ver;
2193
2194             memset(&ver, 0, sizeof(ver));
2195             ver.dwOSVersionInfoSize = sizeof(ver);
2196             GetVersionEx((OSVERSIONINFO*)&ver);
2197
2198             s_isAtLeastWin2kSP4 =
2199               ((ver.dwMajorVersion > 5) || // Vista+
2200                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2201                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2202                ver.wServicePackMajor >= 4)) // 2000 SP4+
2203               ? 1 : 0;
2204         }
2205
2206         return s_isAtLeastWin2kSP4 == 1;
2207 #endif
2208     }
2209
2210
2211     // the code page we're working with
2212     long m_CodePage;
2213
2214     // cached result of GetMBNulLen(), set to 0 initially meaning
2215     // "unknown"
2216     size_t m_minMBCharWidth;
2217 };
2218
2219 #endif // wxHAVE_WIN32_MB2WC
2220
2221 // ============================================================================
2222 // Cocoa conversion classes
2223 // ============================================================================
2224
2225 #if defined(__WXCOCOA__)
2226
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2230
2231 #include <CoreFoundation/CFString.h>
2232 #include <CoreFoundation/CFStringEncodingExt.h>
2233
2234 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2235 {
2236     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2237     if ( encoding == wxFONTENCODING_DEFAULT )
2238     {
2239         enc = CFStringGetSystemEncoding();
2240     }
2241     else switch( encoding)
2242     {
2243         case wxFONTENCODING_ISO8859_1 :
2244             enc = kCFStringEncodingISOLatin1 ;
2245             break ;
2246         case wxFONTENCODING_ISO8859_2 :
2247             enc = kCFStringEncodingISOLatin2;
2248             break ;
2249         case wxFONTENCODING_ISO8859_3 :
2250             enc = kCFStringEncodingISOLatin3 ;
2251             break ;
2252         case wxFONTENCODING_ISO8859_4 :
2253             enc = kCFStringEncodingISOLatin4;
2254             break ;
2255         case wxFONTENCODING_ISO8859_5 :
2256             enc = kCFStringEncodingISOLatinCyrillic;
2257             break ;
2258         case wxFONTENCODING_ISO8859_6 :
2259             enc = kCFStringEncodingISOLatinArabic;
2260             break ;
2261         case wxFONTENCODING_ISO8859_7 :
2262             enc = kCFStringEncodingISOLatinGreek;
2263             break ;
2264         case wxFONTENCODING_ISO8859_8 :
2265             enc = kCFStringEncodingISOLatinHebrew;
2266             break ;
2267         case wxFONTENCODING_ISO8859_9 :
2268             enc = kCFStringEncodingISOLatin5;
2269             break ;
2270         case wxFONTENCODING_ISO8859_10 :
2271             enc = kCFStringEncodingISOLatin6;
2272             break ;
2273         case wxFONTENCODING_ISO8859_11 :
2274             enc = kCFStringEncodingISOLatinThai;
2275             break ;
2276         case wxFONTENCODING_ISO8859_13 :
2277             enc = kCFStringEncodingISOLatin7;
2278             break ;
2279         case wxFONTENCODING_ISO8859_14 :
2280             enc = kCFStringEncodingISOLatin8;
2281             break ;
2282         case wxFONTENCODING_ISO8859_15 :
2283             enc = kCFStringEncodingISOLatin9;
2284             break ;
2285
2286         case wxFONTENCODING_KOI8 :
2287             enc = kCFStringEncodingKOI8_R;
2288             break ;
2289         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2290             enc = kCFStringEncodingDOSRussian;
2291             break ;
2292
2293 //      case wxFONTENCODING_BULGARIAN :
2294 //          enc = ;
2295 //          break ;
2296
2297         case wxFONTENCODING_CP437 :
2298             enc =kCFStringEncodingDOSLatinUS ;
2299             break ;
2300         case wxFONTENCODING_CP850 :
2301             enc = kCFStringEncodingDOSLatin1;
2302             break ;
2303         case wxFONTENCODING_CP852 :
2304             enc = kCFStringEncodingDOSLatin2;
2305             break ;
2306         case wxFONTENCODING_CP855 :
2307             enc = kCFStringEncodingDOSCyrillic;
2308             break ;
2309         case wxFONTENCODING_CP866 :
2310             enc =kCFStringEncodingDOSRussian ;
2311             break ;
2312         case wxFONTENCODING_CP874 :
2313             enc = kCFStringEncodingDOSThai;
2314             break ;
2315         case wxFONTENCODING_CP932 :
2316             enc = kCFStringEncodingDOSJapanese;
2317             break ;
2318         case wxFONTENCODING_CP936 :
2319             enc =kCFStringEncodingDOSChineseSimplif ;
2320             break ;
2321         case wxFONTENCODING_CP949 :
2322             enc = kCFStringEncodingDOSKorean;
2323             break ;
2324         case wxFONTENCODING_CP950 :
2325             enc = kCFStringEncodingDOSChineseTrad;
2326             break ;
2327         case wxFONTENCODING_CP1250 :
2328             enc = kCFStringEncodingWindowsLatin2;
2329             break ;
2330         case wxFONTENCODING_CP1251 :
2331             enc =kCFStringEncodingWindowsCyrillic ;
2332             break ;
2333         case wxFONTENCODING_CP1252 :
2334             enc =kCFStringEncodingWindowsLatin1 ;
2335             break ;
2336         case wxFONTENCODING_CP1253 :
2337             enc = kCFStringEncodingWindowsGreek;
2338             break ;
2339         case wxFONTENCODING_CP1254 :
2340             enc = kCFStringEncodingWindowsLatin5;
2341             break ;
2342         case wxFONTENCODING_CP1255 :
2343             enc =kCFStringEncodingWindowsHebrew ;
2344             break ;
2345         case wxFONTENCODING_CP1256 :
2346             enc =kCFStringEncodingWindowsArabic ;
2347             break ;
2348         case wxFONTENCODING_CP1257 :
2349             enc = kCFStringEncodingWindowsBalticRim;
2350             break ;
2351 //   This only really encodes to UTF7 (if that) evidently
2352 //        case wxFONTENCODING_UTF7 :
2353 //            enc = kCFStringEncodingNonLossyASCII ;
2354 //            break ;
2355         case wxFONTENCODING_UTF8 :
2356             enc = kCFStringEncodingUTF8 ;
2357             break ;
2358         case wxFONTENCODING_EUC_JP :
2359             enc = kCFStringEncodingEUC_JP;
2360             break ;
2361         case wxFONTENCODING_UTF16 :
2362             enc = kCFStringEncodingUnicode ;
2363             break ;
2364         case wxFONTENCODING_MACROMAN :
2365             enc = kCFStringEncodingMacRoman ;
2366             break ;
2367         case wxFONTENCODING_MACJAPANESE :
2368             enc = kCFStringEncodingMacJapanese ;
2369             break ;
2370         case wxFONTENCODING_MACCHINESETRAD :
2371             enc = kCFStringEncodingMacChineseTrad ;
2372             break ;
2373         case wxFONTENCODING_MACKOREAN :
2374             enc = kCFStringEncodingMacKorean ;
2375             break ;
2376         case wxFONTENCODING_MACARABIC :
2377             enc = kCFStringEncodingMacArabic ;
2378             break ;
2379         case wxFONTENCODING_MACHEBREW :
2380             enc = kCFStringEncodingMacHebrew ;
2381             break ;
2382         case wxFONTENCODING_MACGREEK :
2383             enc = kCFStringEncodingMacGreek ;
2384             break ;
2385         case wxFONTENCODING_MACCYRILLIC :
2386             enc = kCFStringEncodingMacCyrillic ;
2387             break ;
2388         case wxFONTENCODING_MACDEVANAGARI :
2389             enc = kCFStringEncodingMacDevanagari ;
2390             break ;
2391         case wxFONTENCODING_MACGURMUKHI :
2392             enc = kCFStringEncodingMacGurmukhi ;
2393             break ;
2394         case wxFONTENCODING_MACGUJARATI :
2395             enc = kCFStringEncodingMacGujarati ;
2396             break ;
2397         case wxFONTENCODING_MACORIYA :
2398             enc = kCFStringEncodingMacOriya ;
2399             break ;
2400         case wxFONTENCODING_MACBENGALI :
2401             enc = kCFStringEncodingMacBengali ;
2402             break ;
2403         case wxFONTENCODING_MACTAMIL :
2404             enc = kCFStringEncodingMacTamil ;
2405             break ;
2406         case wxFONTENCODING_MACTELUGU :
2407             enc = kCFStringEncodingMacTelugu ;
2408             break ;
2409         case wxFONTENCODING_MACKANNADA :
2410             enc = kCFStringEncodingMacKannada ;
2411             break ;
2412         case wxFONTENCODING_MACMALAJALAM :
2413             enc = kCFStringEncodingMacMalayalam ;
2414             break ;
2415         case wxFONTENCODING_MACSINHALESE :
2416             enc = kCFStringEncodingMacSinhalese ;
2417             break ;
2418         case wxFONTENCODING_MACBURMESE :
2419             enc = kCFStringEncodingMacBurmese ;
2420             break ;
2421         case wxFONTENCODING_MACKHMER :
2422             enc = kCFStringEncodingMacKhmer ;
2423             break ;
2424         case wxFONTENCODING_MACTHAI :
2425             enc = kCFStringEncodingMacThai ;
2426             break ;
2427         case wxFONTENCODING_MACLAOTIAN :
2428             enc = kCFStringEncodingMacLaotian ;
2429             break ;
2430         case wxFONTENCODING_MACGEORGIAN :
2431             enc = kCFStringEncodingMacGeorgian ;
2432             break ;
2433         case wxFONTENCODING_MACARMENIAN :
2434             enc = kCFStringEncodingMacArmenian ;
2435             break ;
2436         case wxFONTENCODING_MACCHINESESIMP :
2437             enc = kCFStringEncodingMacChineseSimp ;
2438             break ;
2439         case wxFONTENCODING_MACTIBETAN :
2440             enc = kCFStringEncodingMacTibetan ;
2441             break ;
2442         case wxFONTENCODING_MACMONGOLIAN :
2443             enc = kCFStringEncodingMacMongolian ;
2444             break ;
2445         case wxFONTENCODING_MACETHIOPIC :
2446             enc = kCFStringEncodingMacEthiopic ;
2447             break ;
2448         case wxFONTENCODING_MACCENTRALEUR :
2449             enc = kCFStringEncodingMacCentralEurRoman ;
2450             break ;
2451         case wxFONTENCODING_MACVIATNAMESE :
2452             enc = kCFStringEncodingMacVietnamese ;
2453             break ;
2454         case wxFONTENCODING_MACARABICEXT :
2455             enc = kCFStringEncodingMacExtArabic ;
2456             break ;
2457         case wxFONTENCODING_MACSYMBOL :
2458             enc = kCFStringEncodingMacSymbol ;
2459             break ;
2460         case wxFONTENCODING_MACDINGBATS :
2461             enc = kCFStringEncodingMacDingbats ;
2462             break ;
2463         case wxFONTENCODING_MACTURKISH :
2464             enc = kCFStringEncodingMacTurkish ;
2465             break ;
2466         case wxFONTENCODING_MACCROATIAN :
2467             enc = kCFStringEncodingMacCroatian ;
2468             break ;
2469         case wxFONTENCODING_MACICELANDIC :
2470             enc = kCFStringEncodingMacIcelandic ;
2471             break ;
2472         case wxFONTENCODING_MACROMANIAN :
2473             enc = kCFStringEncodingMacRomanian ;
2474             break ;
2475         case wxFONTENCODING_MACCELTIC :
2476             enc = kCFStringEncodingMacCeltic ;
2477             break ;
2478         case wxFONTENCODING_MACGAELIC :
2479             enc = kCFStringEncodingMacGaelic ;
2480             break ;
2481 //      case wxFONTENCODING_MACKEYBOARD :
2482 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2483 //          break ;
2484         default :
2485             // because gcc is picky
2486             break ;
2487     } ;
2488     return enc ;
2489 }
2490
2491 class wxMBConv_cocoa : public wxMBConv
2492 {
2493 public:
2494     wxMBConv_cocoa()
2495     {
2496         Init(CFStringGetSystemEncoding()) ;
2497     }
2498
2499     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2500     {
2501         m_encoding = conv.m_encoding;
2502     }
2503
2504 #if wxUSE_FONTMAP
2505     wxMBConv_cocoa(const wxChar* name)
2506     {
2507         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2508     }
2509 #endif
2510
2511     wxMBConv_cocoa(wxFontEncoding encoding)
2512     {
2513         Init( wxCFStringEncFromFontEnc(encoding) );
2514     }
2515
2516     ~wxMBConv_cocoa()
2517     {
2518     }
2519
2520     void Init( CFStringEncoding encoding)
2521     {
2522         m_encoding = encoding ;
2523     }
2524
2525     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2526     {
2527         wxASSERT(szUnConv);
2528
2529         CFStringRef theString = CFStringCreateWithBytes (
2530                                                 NULL, //the allocator
2531                                                 (const UInt8*)szUnConv,
2532                                                 strlen(szUnConv),
2533                                                 m_encoding,
2534                                                 false //no BOM/external representation
2535                                                 );
2536
2537         wxASSERT(theString);
2538
2539         size_t nOutLength = CFStringGetLength(theString);
2540
2541         if (szOut == NULL)
2542         {
2543             CFRelease(theString);
2544             return nOutLength;
2545         }
2546
2547         CFRange theRange = { 0, nOutSize };
2548
2549 #if SIZEOF_WCHAR_T == 4
2550         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2551 #endif
2552
2553         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2554
2555         CFRelease(theString);
2556
2557         szUniCharBuffer[nOutLength] = '\0' ;
2558
2559 #if SIZEOF_WCHAR_T == 4
2560         wxMBConvUTF16 converter ;
2561         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2562         delete[] szUniCharBuffer;
2563 #endif
2564
2565         return nOutLength;
2566     }
2567
2568     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2569     {
2570         wxASSERT(szUnConv);
2571
2572         size_t nRealOutSize;
2573         size_t nBufSize = wxWcslen(szUnConv);
2574         UniChar* szUniBuffer = (UniChar*) szUnConv;
2575
2576 #if SIZEOF_WCHAR_T == 4
2577         wxMBConvUTF16 converter ;
2578         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2579         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2580         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2581         nBufSize /= sizeof(UniChar);
2582 #endif
2583
2584         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2585                                 NULL, //allocator
2586                                 szUniBuffer,
2587                                 nBufSize,
2588                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2589                             );
2590
2591         wxASSERT(theString);
2592
2593         //Note that CER puts a BOM when converting to unicode
2594         //so we  check and use getchars instead in that case
2595         if (m_encoding == kCFStringEncodingUnicode)
2596         {
2597             if (szOut != NULL)
2598                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2599
2600             nRealOutSize = CFStringGetLength(theString) + 1;
2601         }
2602         else
2603         {
2604             CFStringGetBytes(
2605                 theString,
2606                 CFRangeMake(0, CFStringGetLength(theString)),
2607                 m_encoding,
2608                 0, //what to put in characters that can't be converted -
2609                     //0 tells CFString to return NULL if it meets such a character
2610                 false, //not an external representation
2611                 (UInt8*) szOut,
2612                 nOutSize,
2613                 (CFIndex*) &nRealOutSize
2614                         );
2615         }
2616
2617         CFRelease(theString);
2618
2619 #if SIZEOF_WCHAR_T == 4
2620         delete[] szUniBuffer;
2621 #endif
2622
2623         return  nRealOutSize - 1;
2624     }
2625
2626     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2627
2628     bool IsOk() const
2629     {
2630         return m_encoding != kCFStringEncodingInvalidId &&
2631               CFStringIsEncodingAvailable(m_encoding);
2632     }
2633
2634 private:
2635     CFStringEncoding m_encoding ;
2636 };
2637
2638 #endif // defined(__WXCOCOA__)
2639
2640 // ============================================================================
2641 // Mac conversion classes
2642 // ============================================================================
2643
2644 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2645
2646 class wxMBConv_mac : public wxMBConv
2647 {
2648 public:
2649     wxMBConv_mac()
2650     {
2651         Init(CFStringGetSystemEncoding()) ;
2652     }
2653
2654     wxMBConv_mac(const wxMBConv_mac& conv)
2655     {
2656         Init(conv.m_char_encoding);
2657     }
2658
2659 #if wxUSE_FONTMAP
2660     wxMBConv_mac(const wxChar* name)
2661     {
2662         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2663     }
2664 #endif
2665
2666     wxMBConv_mac(wxFontEncoding encoding)
2667     {
2668         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2669     }
2670
2671     ~wxMBConv_mac()
2672     {
2673         OSStatus status = noErr ;
2674         status = TECDisposeConverter(m_MB2WC_converter);
2675         status = TECDisposeConverter(m_WC2MB_converter);
2676     }
2677
2678
2679     void Init( TextEncodingBase encoding)
2680     {
2681         OSStatus status = noErr ;
2682         m_char_encoding = encoding ;
2683         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2684
2685         status = TECCreateConverter(&m_MB2WC_converter,
2686                                     m_char_encoding,
2687                                     m_unicode_encoding);
2688         status = TECCreateConverter(&m_WC2MB_converter,
2689                                     m_unicode_encoding,
2690                                     m_char_encoding);
2691     }
2692
2693     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2694     {
2695         OSStatus status = noErr ;
2696         ByteCount byteOutLen ;
2697         ByteCount byteInLen = strlen(psz) ;
2698         wchar_t *tbuf = NULL ;
2699         UniChar* ubuf = NULL ;
2700         size_t res = 0 ;
2701
2702         if (buf == NULL)
2703         {
2704             //apple specs say at least 32
2705             n = wxMax( 32 , byteInLen ) ;
2706             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2707         }
2708         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2709 #if SIZEOF_WCHAR_T == 4
2710         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2711 #else
2712         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2713 #endif
2714         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2715           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2716 #if SIZEOF_WCHAR_T == 4
2717         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2718         // is not properly terminated we get random characters at the end
2719         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2720         wxMBConvUTF16 converter ;
2721         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2722         free( ubuf ) ;
2723 #else
2724         res = byteOutLen / sizeof( UniChar ) ;
2725 #endif
2726         if ( buf == NULL )
2727              free(tbuf) ;
2728
2729         if ( buf  && res < n)
2730             buf[res] = 0;
2731
2732         return res ;
2733     }
2734
2735     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2736     {
2737         OSStatus status = noErr ;
2738         ByteCount byteOutLen ;
2739         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2740
2741         char *tbuf = NULL ;
2742
2743         if (buf == NULL)
2744         {
2745             //apple specs say at least 32
2746             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2747             tbuf = (char*) malloc( n ) ;
2748         }
2749
2750         ByteCount byteBufferLen = n ;
2751         UniChar* ubuf = NULL ;
2752 #if SIZEOF_WCHAR_T == 4
2753         wxMBConvUTF16 converter ;
2754         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2755         byteInLen = unicharlen ;
2756         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2757         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2758 #else
2759         ubuf = (UniChar*) psz ;
2760 #endif
2761         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2762             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2763 #if SIZEOF_WCHAR_T == 4
2764         free( ubuf ) ;
2765 #endif
2766         if ( buf == NULL )
2767             free(tbuf) ;
2768
2769         size_t res = byteOutLen ;
2770         if ( buf  && res < n)
2771         {
2772             buf[res] = 0;
2773
2774             //we need to double-trip to verify it didn't insert any ? in place
2775             //of bogus characters
2776             wxWCharBuffer wcBuf(n);
2777             size_t pszlen = wxWcslen(psz);
2778             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2779                         wxWcslen(wcBuf) != pszlen ||
2780                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2781             {
2782                 // we didn't obtain the same thing we started from, hence
2783                 // the conversion was lossy and we consider that it failed
2784                 return wxCONV_FAILED;
2785             }
2786         }
2787
2788         return res ;
2789     }
2790
2791     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2792
2793     bool IsOk() const
2794         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2795
2796 private:
2797     TECObjectRef m_MB2WC_converter ;
2798     TECObjectRef m_WC2MB_converter ;
2799
2800     TextEncodingBase m_char_encoding ;
2801     TextEncodingBase m_unicode_encoding ;
2802 };
2803
2804 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2805
2806 // ============================================================================
2807 // wxEncodingConverter based conversion classes
2808 // ============================================================================
2809
2810 #if wxUSE_FONTMAP
2811
2812 class wxMBConv_wxwin : public wxMBConv
2813 {
2814 private:
2815     void Init()
2816     {
2817         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2818                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2819     }
2820
2821 public:
2822     // temporarily just use wxEncodingConverter stuff,
2823     // so that it works while a better implementation is built
2824     wxMBConv_wxwin(const wxChar* name)
2825     {
2826         if (name)
2827             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2828         else
2829             m_enc = wxFONTENCODING_SYSTEM;
2830
2831         Init();
2832     }
2833
2834     wxMBConv_wxwin(wxFontEncoding enc)
2835     {
2836         m_enc = enc;
2837
2838         Init();
2839     }
2840
2841     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2842     {
2843         size_t inbuf = strlen(psz);
2844         if (buf)
2845         {
2846             if (!m2w.Convert(psz,buf))
2847                 return wxCONV_FAILED;
2848         }
2849         return inbuf;
2850     }
2851
2852     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2853     {
2854         const size_t inbuf = wxWcslen(psz);
2855         if (buf)
2856         {
2857             if (!w2m.Convert(psz,buf))
2858                 return wxCONV_FAILED;
2859         }
2860
2861         return inbuf;
2862     }
2863
2864     virtual size_t GetMBNulLen() const
2865     {
2866         switch ( m_enc )
2867         {
2868             case wxFONTENCODING_UTF16BE:
2869             case wxFONTENCODING_UTF16LE:
2870                 return 2;
2871
2872             case wxFONTENCODING_UTF32BE:
2873             case wxFONTENCODING_UTF32LE:
2874                 return 4;
2875
2876             default:
2877                 return 1;
2878         }
2879     }
2880
2881     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2882
2883     bool IsOk() const { return m_ok; }
2884
2885 public:
2886     wxFontEncoding m_enc;
2887     wxEncodingConverter m2w, w2m;
2888
2889 private:
2890     // were we initialized successfully?
2891     bool m_ok;
2892
2893     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2894 };
2895
2896 // make the constructors available for unit testing
2897 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2898 {
2899     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2900     if ( !result->IsOk() )
2901     {
2902         delete result;
2903         return 0;
2904     }
2905     return result;
2906 }
2907
2908 #endif // wxUSE_FONTMAP
2909
2910 // ============================================================================
2911 // wxCSConv implementation
2912 // ============================================================================
2913
2914 void wxCSConv::Init()
2915 {
2916     m_name = NULL;
2917     m_convReal =  NULL;
2918     m_deferred = true;
2919 }
2920
2921 wxCSConv::wxCSConv(const wxChar *charset)
2922 {
2923     Init();
2924
2925     if ( charset )
2926     {
2927         SetName(charset);
2928     }
2929
2930 #if wxUSE_FONTMAP
2931     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2932 #else
2933     m_encoding = wxFONTENCODING_SYSTEM;
2934 #endif
2935 }
2936
2937 wxCSConv::wxCSConv(wxFontEncoding encoding)
2938 {
2939     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2940     {
2941         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2942
2943         encoding = wxFONTENCODING_SYSTEM;
2944     }
2945
2946     Init();
2947
2948     m_encoding = encoding;
2949 }
2950
2951 wxCSConv::~wxCSConv()
2952 {
2953     Clear();
2954 }
2955
2956 wxCSConv::wxCSConv(const wxCSConv& conv)
2957         : wxMBConv()
2958 {
2959     Init();
2960
2961     SetName(conv.m_name);
2962     m_encoding = conv.m_encoding;
2963 }
2964
2965 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2966 {
2967     Clear();
2968
2969     SetName(conv.m_name);
2970     m_encoding = conv.m_encoding;
2971
2972     return *this;
2973 }
2974
2975 void wxCSConv::Clear()
2976 {
2977     free(m_name);
2978     delete m_convReal;
2979
2980     m_name = NULL;
2981     m_convReal = NULL;
2982 }
2983
2984 void wxCSConv::SetName(const wxChar *charset)
2985 {
2986     if (charset)
2987     {
2988         m_name = wxStrdup(charset);
2989         m_deferred = true;
2990     }
2991 }
2992
2993 #if wxUSE_FONTMAP
2994 #include "wx/hashmap.h"
2995
2996 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2997                      wxEncodingNameCache );
2998
2999 static wxEncodingNameCache gs_nameCache;
3000 #endif
3001
3002 wxMBConv *wxCSConv::DoCreate() const
3003 {
3004 #if wxUSE_FONTMAP
3005     wxLogTrace(TRACE_STRCONV,
3006                wxT("creating conversion for %s"),
3007                (m_name ? m_name
3008                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3009 #endif // wxUSE_FONTMAP
3010
3011     // check for the special case of ASCII or ISO8859-1 charset: as we have
3012     // special knowledge of it anyhow, we don't need to create a special
3013     // conversion object
3014     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3015             m_encoding == wxFONTENCODING_DEFAULT )
3016     {
3017         // don't convert at all
3018         return NULL;
3019     }
3020
3021     // we trust OS to do conversion better than we can so try external
3022     // conversion methods first
3023     //
3024     // the full order is:
3025     //      1. OS conversion (iconv() under Unix or Win32 API)
3026     //      2. hard coded conversions for UTF
3027     //      3. wxEncodingConverter as fall back
3028
3029     // step (1)
3030 #ifdef HAVE_ICONV
3031 #if !wxUSE_FONTMAP
3032     if ( m_name )
3033 #endif // !wxUSE_FONTMAP
3034     {
3035         wxString name(m_name);
3036         wxFontEncoding encoding(m_encoding);
3037
3038         if ( !name.empty() )
3039         {
3040             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3041             if ( conv->IsOk() )
3042                 return conv;
3043
3044             delete conv;
3045
3046 #if wxUSE_FONTMAP
3047             encoding =
3048                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3049 #endif // wxUSE_FONTMAP
3050         }
3051 #if wxUSE_FONTMAP
3052         {
3053             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3054             if ( it != gs_nameCache.end() )
3055             {
3056                 if ( it->second.empty() )
3057                     return NULL;
3058
3059                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3060                 if ( conv->IsOk() )
3061                     return conv;
3062
3063                 delete conv;
3064             }
3065
3066             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3067
3068             for ( ; *names; ++names )
3069             {
3070                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3071                 if ( conv->IsOk() )
3072                 {
3073                     gs_nameCache[encoding] = *names;
3074                     return conv;
3075                 }
3076
3077                 delete conv;
3078             }
3079
3080             gs_nameCache[encoding] = _T(""); // cache the failure
3081         }
3082 #endif // wxUSE_FONTMAP
3083     }
3084 #endif // HAVE_ICONV
3085
3086 #ifdef wxHAVE_WIN32_MB2WC
3087     {
3088 #if wxUSE_FONTMAP
3089         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3090                                       : new wxMBConv_win32(m_encoding);
3091         if ( conv->IsOk() )
3092             return conv;
3093
3094         delete conv;
3095 #else
3096         return NULL;
3097 #endif
3098     }
3099 #endif // wxHAVE_WIN32_MB2WC
3100 #if defined(__WXMAC__)
3101     {
3102         // leave UTF16 and UTF32 to the built-ins of wx
3103         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3104             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3105         {
3106
3107 #if wxUSE_FONTMAP
3108             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3109                                         : new wxMBConv_mac(m_encoding);
3110 #else
3111             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3112 #endif
3113             if ( conv->IsOk() )
3114                  return conv;
3115
3116             delete conv;
3117         }
3118     }
3119 #endif
3120 #if defined(__WXCOCOA__)
3121     {
3122         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3123         {
3124
3125 #if wxUSE_FONTMAP
3126             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3127                                           : new wxMBConv_cocoa(m_encoding);
3128 #else
3129             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3130 #endif
3131             if ( conv->IsOk() )
3132                  return conv;
3133
3134             delete conv;
3135         }
3136     }
3137 #endif
3138     // step (2)
3139     wxFontEncoding enc = m_encoding;
3140 #if wxUSE_FONTMAP
3141     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3142     {
3143         // use "false" to suppress interactive dialogs -- we can be called from
3144         // anywhere and popping up a dialog from here is the last thing we want to
3145         // do
3146         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3147     }
3148 #endif // wxUSE_FONTMAP
3149
3150     switch ( enc )
3151     {
3152         case wxFONTENCODING_UTF7:
3153              return new wxMBConvUTF7;
3154
3155         case wxFONTENCODING_UTF8:
3156              return new wxMBConvUTF8;
3157
3158         case wxFONTENCODING_UTF16BE:
3159              return new wxMBConvUTF16BE;
3160
3161         case wxFONTENCODING_UTF16LE:
3162              return new wxMBConvUTF16LE;
3163
3164         case wxFONTENCODING_UTF32BE:
3165              return new wxMBConvUTF32BE;
3166
3167         case wxFONTENCODING_UTF32LE:
3168              return new wxMBConvUTF32LE;
3169
3170         default:
3171              // nothing to do but put here to suppress gcc warnings
3172              ;
3173     }
3174
3175     // step (3)
3176 #if wxUSE_FONTMAP
3177     {
3178         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3179                                       : new wxMBConv_wxwin(m_encoding);
3180         if ( conv->IsOk() )
3181             return conv;
3182
3183         delete conv;
3184     }
3185 #endif // wxUSE_FONTMAP
3186
3187     // NB: This is a hack to prevent deadlock. What could otherwise happen
3188     //     in Unicode build: wxConvLocal creation ends up being here
3189     //     because of some failure and logs the error. But wxLog will try to
3190     //     attach timestamp, for which it will need wxConvLocal (to convert
3191     //     time to char* and then wchar_t*), but that fails, tries to log
3192     //     error, but wxLog has a (already locked) critical section that
3193     //     guards static buffer.
3194     static bool alreadyLoggingError = false;
3195     if (!alreadyLoggingError)
3196     {
3197         alreadyLoggingError = true;
3198         wxLogError(_("Cannot convert from the charset '%s'!"),
3199                    m_name ? m_name
3200                       :
3201 #if wxUSE_FONTMAP
3202                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3203 #else // !wxUSE_FONTMAP
3204                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3205 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3206               );
3207         alreadyLoggingError = false;
3208     }
3209
3210     return NULL;
3211 }
3212
3213 void wxCSConv::CreateConvIfNeeded() const
3214 {
3215     if ( m_deferred )
3216     {
3217         wxCSConv *self = (wxCSConv *)this; // const_cast
3218
3219 #if wxUSE_INTL
3220         // if we don't have neither the name nor the encoding, use the default
3221         // encoding for this system
3222         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3223         {
3224             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3225         }
3226 #endif // wxUSE_INTL
3227
3228         self->m_convReal = DoCreate();
3229         self->m_deferred = false;
3230     }
3231 }
3232
3233 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3234 {
3235     CreateConvIfNeeded();
3236
3237     if (m_convReal)
3238         return m_convReal->MB2WC(buf, psz, n);
3239
3240     // latin-1 (direct)
3241     size_t len = strlen(psz);
3242
3243     if (buf)
3244     {
3245         for (size_t c = 0; c <= len; c++)
3246             buf[c] = (unsigned char)(psz[c]);
3247     }
3248
3249     return len;
3250 }
3251
3252 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3253 {
3254     CreateConvIfNeeded();
3255
3256     if (m_convReal)
3257         return m_convReal->WC2MB(buf, psz, n);
3258
3259     // latin-1 (direct)
3260     const size_t len = wxWcslen(psz);
3261     if (buf)
3262     {
3263         for (size_t c = 0; c <= len; c++)
3264         {
3265             if (psz[c] > 0xFF)
3266                 return wxCONV_FAILED;
3267             buf[c] = (char)psz[c];
3268         }
3269     }
3270     else
3271     {
3272         for (size_t c = 0; c <= len; c++)
3273         {
3274             if (psz[c] > 0xFF)
3275                 return wxCONV_FAILED;
3276         }
3277     }
3278
3279     return len;
3280 }
3281
3282 size_t wxCSConv::GetMBNulLen() const
3283 {
3284     CreateConvIfNeeded();
3285
3286     if ( m_convReal )
3287     {
3288         return m_convReal->GetMBNulLen();
3289     }
3290
3291     return 1;
3292 }
3293
3294 // ----------------------------------------------------------------------------
3295 // globals
3296 // ----------------------------------------------------------------------------
3297
3298 #ifdef __WINDOWS__
3299     static wxMBConv_win32 wxConvLibcObj;
3300 #elif defined(__WXMAC__) && !defined(__MACH__)
3301     static wxMBConv_mac wxConvLibcObj ;
3302 #else
3303     static wxMBConvLibc wxConvLibcObj;
3304 #endif
3305
3306 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3307 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3308 static wxMBConvUTF7 wxConvUTF7Obj;
3309 static wxMBConvUTF8 wxConvUTF8Obj;
3310
3311 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3312 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3313 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3314 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3315 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3317 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3318 #ifdef __WXOSX__
3319                                     wxConvUTF8Obj;
3320 #else
3321                                     wxConvLibcObj;
3322 #endif
3323
3324
3325 #else // !wxUSE_WCHAR_T
3326
3327 // stand-ins in absence of wchar_t
3328 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3329                                 wxConvISO8859_1,
3330                                 wxConvLocal,
3331                                 wxConvUTF8;
3332
3333 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T