src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63 #ifdef __WXMAC__
  64 #ifndef __DARWIN__
  65 #include <ATSUnicode.h>
  66 #include <TextCommon.h>
  67 #include <TextEncodingConverter.h>
  68 #endif
  69
  70 // includes Mac headers
  71 #include "wx/mac/private.h"
  72 #endif
  73
  74
  75 #define TRACE_STRCONV _T("strconv")
  76
  77 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  78 // be 4 bytes
  79 #if SIZEOF_WCHAR_T == 2
  80     #define WC_UTF16
  81 #endif
  82
  83
  84 // ============================================================================
  85 // implementation
  86 // ============================================================================
  87
  88 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  89 static bool NotAllNULs(const char *p, size_t n)
  90 {
  91     while ( n && *p++ == '\0' )
  92         n--;
  93
  94     return n != 0;
  95 }
  96
  97 // ----------------------------------------------------------------------------
  98 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  99 // ----------------------------------------------------------------------------
 100
 101 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 102 {
 103     if (input <= 0xffff)
 104     {
 105         if (output)
 106             *output = (wxUint16) input;
 107
 108         return 1;
 109     }
 110     else if (input >= 0x110000)
 111     {
 112         return wxCONV_FAILED;
 113     }
 114     else
 115     {
 116         if (output)
 117         {
 118             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 119             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 120         }
 121
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input < 0xd800) || (*input > 0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 134     {
 135         output = *input;
 136         return wxCONV_FAILED;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145 #ifdef WC_UTF16
 146     typedef wchar_t wxDecodeSurrogate_t;
 147 #else // !WC_UTF16
 148     typedef wxUint16 wxDecodeSurrogate_t;
 149 #endif // WC_UTF16/!WC_UTF16
 150
 151 // returns the next UTF-32 character from the wchar_t buffer and advances the
 152 // pointer to the character after this one
 153 //
 154 // if an invalid character is found, *pSrc is set to NULL, the caller must
 155 // check for this
 156 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 157 {
 158     wxUint32 out;
 159     const size_t
 160         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 161     if ( n == wxCONV_FAILED )
 162         *pSrc = NULL;
 163     else
 164         *pSrc += n;
 165
 166     return out;
 167 }
 168
 169 // ----------------------------------------------------------------------------
 170 // wxMBConv
 171 // ----------------------------------------------------------------------------
 172
// Default implementation of the multibyte to wide character conversion in
// terms of the older MB2WC().
//
// Converts srcLen bytes starting at src or, if srcLen is wxNO_LEN, the
// NUL-terminated string src. The input is processed as a sequence of
// NUL-terminated chunks, each of them being passed to MB2WC().
//
// Returns the number of wide characters, including the NUL following each
// chunk, written to dst or, if dst is NULL, the number which would be
// written. Returns wxCONV_FAILED if the conversion of any chunk fails, if
// GetMBNulLen() fails or if dst is not NULL and dstLen is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string: the
            // copy holds the srcLen input bytes followed by nulLen NUL bytes
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length needed for it
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): an empty chunk ends the loop even when srcLen was
            // given, so any input following two consecutive NUL characters
            // does not seem to be converted, and nothing is stored in dst for
            // the empty chunk itself although it is counted in the return
            // value -- confirm that this is intended
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 279
 280 size_t
 281 wxMBConv::FromWChar(char *dst, size_t dstLen,
 282                     const wchar_t *src, size_t srcLen) const
 283 {
 284     // the number of chars [which would be] written to dst [if it were not NULL]
 285     size_t dstWritten = 0;
 286
 287     // make a copy of the input string unless it is already properly
 288     // NUL-terminated
 289     //
 290     // if we don't know its length we have no choice but to assume that it is,
 291     // indeed, properly terminated
 292     wxWCharBuffer bufTmp;
 293     if ( srcLen == wxNO_LEN )
 294     {
 295         srcLen = wxWcslen(src) + 1;
 296     }
 297     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 298     {
 299         // make a copy in order to properly NUL-terminate the string
 300         bufTmp = wxWCharBuffer(srcLen);
 301         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 302         src = bufTmp;
 303     }
 304
 305     const size_t lenNul = GetMBNulLen();
 306     for ( const wchar_t * const srcEnd = src + srcLen;
 307           src < srcEnd;
 308           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 309     {
 310         // try to convert the current chunk
 311         size_t lenChunk = WC2MB(NULL, src, 0);
 312
 313         if ( lenChunk == wxCONV_FAILED )
 314             return wxCONV_FAILED;
 315
 316         lenChunk += lenNul;
 317         dstWritten += lenChunk;
 318
 319         if ( dst )
 320         {
 321             if ( dstWritten > dstLen )
 322                 return wxCONV_FAILED;
 323
 324             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 325                 return wxCONV_FAILED;
 326
 327             dst += lenChunk;
 328         }
 329     }
 330
 331     return dstWritten;
 332 }
 333
 334 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 335 {
 336     size_t rc = ToWChar(outBuff, outLen, inBuff);
 337     if ( rc != wxCONV_FAILED )
 338     {
 339         // ToWChar() returns the buffer length, i.e. including the trailing
 340         // NUL, while this method doesn't take it into account
 341         rc--;
 342     }
 343
 344     return rc;
 345 }
 346
 347 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 348 {
 349     size_t rc = FromWChar(outBuff, outLen, inBuff);
 350     if ( rc != wxCONV_FAILED )
 351     {
 352         rc -= GetMBNulLen();
 353     }
 354
 355     return rc;
 356 }
 357
// The destructor has nothing to clean up but is still defined out of line.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 362
 363 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 364 {
 365     if ( psz )
 366     {
 367         // calculate the length of the buffer needed first
 368         const size_t nLen = MB2WC(NULL, psz, 0);
 369         if ( nLen != wxCONV_FAILED )
 370         {
 371             // now do the actual conversion
 372             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 373
 374             // +1 for the trailing NULL
 375             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 376                 return buf;
 377         }
 378     }
 379
 380     return wxWCharBuffer();
 381 }
 382
 383 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 384 {
 385     if ( pwz )
 386     {
 387         const size_t nLen = WC2MB(NULL, pwz, 0);
 388         if ( nLen != wxCONV_FAILED )
 389         {
 390             // extra space for trailing NUL(s)
 391             static const size_t extraLen = GetMaxMBNulLen();
 392
 393             wxCharBuffer buf(nLen + extraLen - 1);
 394             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 395                 return buf;
 396         }
 397     }
 398
 399     return wxCharBuffer();
 400 }
 401
 402 const wxWCharBuffer
 403 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxWCharBuffer wbuf(dstLen - 1);
 409         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 410         {
 411             if ( outLen )
 412             {
 413                 *outLen = dstLen;
 414                 if ( wbuf[dstLen - 1] == L'\0' )
 415                     (*outLen)--;
 416             }
 417
 418             return wbuf;
 419         }
 420     }
 421
 422     if ( outLen )
 423         *outLen = 0;
 424
 425     return wxWCharBuffer();
 426 }
 427
 428 const wxCharBuffer
 429 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 430 {
 431     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 432     if ( dstLen != wxCONV_FAILED )
 433     {
 434         // special case of empty input: can't allocate 0 size buffer below as
 435         // wxCharBuffer insists on NUL-terminating it
 436         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 437         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 438         {
 439             if ( outLen )
 440             {
 441                 *outLen = dstLen;
 442
 443                 const size_t nulLen = GetMBNulLen();
 444                 if ( dstLen >= nulLen &&
 445                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 446                 {
 447                     // in this case the output is NUL-terminated and we're not
 448                     // supposed to count NUL
 449                     *outLen -= nulLen;
 450                 }
 451             }
 452
 453             return buf;
 454         }
 455     }
 456
 457     if ( outLen )
 458         *outLen = 0;
 459
 460     return wxCharBuffer();
 461 }
 462
 463 // ----------------------------------------------------------------------------
 464 // wxMBConvLibc
 465 // ----------------------------------------------------------------------------
 466
// Multibyte to wide character conversion: simply forwards all its arguments
// to wxMB2WC() and returns its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 471
// Wide character to multibyte conversion: simply forwards all its arguments
// to wxWC2MB() and returns its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 476
 477 // ----------------------------------------------------------------------------
 478 // wxConvBrokenFileNames
 479 // ----------------------------------------------------------------------------
 480
 481 #ifdef __UNIX__
 482
 483 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 484 {
 485     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 486          wxStricmp(charset, _T("UTF8")) == 0  )
 487         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 488     else
 489         m_conv = new wxCSConv(charset);
 490 }
 491
 492 #endif // __UNIX__
 493
 494 // ----------------------------------------------------------------------------
 495 // UTF-7
 496 // ----------------------------------------------------------------------------
 497
 498 // Implementation (C) 2004 Fredrik Roubert
 499
//
// BASE64 decoding table
//
// The table is indexed by the byte value and has an entry for each of the 256
// possible bytes: the characters of the BASE64 alphabet ('A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/') map to their 6 bit values 0x00-0x3f and all the other
// bytes map to 0xff, which is used as the "not a BASE64 character" marker.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 538
 539 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 540 {
 541     size_t len = 0;
 542
 543     while ( *psz && (!buf || (len < n)) )
 544     {
 545         unsigned char cc = *psz++;
 546         if (cc != '+')
 547         {
 548             // plain ASCII char
 549             if (buf)
 550                 *buf++ = cc;
 551             len++;
 552         }
 553         else if (*psz == '-')
 554         {
 555             // encoded plus sign
 556             if (buf)
 557                 *buf++ = cc;
 558             len++;
 559             psz++;
 560         }
 561         else // start of BASE64 encoded string
 562         {
 563             bool lsb, ok;
 564             unsigned int d, l;
 565             for ( ok = lsb = false, d = 0, l = 0;
 566                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 567                   psz++ )
 568             {
 569                 d <<= 6;
 570                 d += cc;
 571                 for (l += 6; l >= 8; lsb = !lsb)
 572                 {
 573                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 574                     if (lsb)
 575                     {
 576                         if (buf)
 577                             *buf++ |= c;
 578                         len ++;
 579                     }
 580                     else
 581                     {
 582                         if (buf)
 583                             *buf = (wchar_t)(c << 8);
 584                     }
 585
 586                     ok = true;
 587                 }
 588             }
 589
 590             if ( !ok )
 591             {
 592                 // in valid UTF7 we should have valid characters after '+'
 593                 return wxCONV_FAILED;
 594             }
 595
 596             if (*psz == '-')
 597                 psz++;
 598         }
 599     }
 600
 601     if ( buf && (len < n) )
 602         *buf = '\0';
 603
 604     return len;
 605 }
 606
//
// BASE64 encoding table
//
// Maps each 6 bit value (0-63) to the corresponding character of the BASE64
// alphabet.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 621
//
// UTF-7 encoding table
//
// Indexed by the ASCII code (0-127) of the character, the value tells how the
// character can be represented in UTF-7:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Notice that the encoder below writes directly only the characters of the
// class 0 and uses BASE64 for everything else.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 641
// Converts the NUL-terminated wide character string psz to UTF-7.
//
// If buf is not NULL, the output is stored in it, followed by a NUL if fewer
// than n bytes were produced. If buf is NULL, nothing is stored and only the
// length is computed.
//
// Returns the number of bytes (not counting the trailing NUL) produced or,
// when wchar_t is not a UTF-16 code unit, wxCONV_FAILED if a character
// starting a new sequence is greater than 0xffff.
//
// NOTE(review): n is only checked before starting to encode each character or
// BASE64 sequence, the bytes written inside a sequence are not checked
// against it -- the caller presumably must provide a buffer big enough for
// the entire output, confirm.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        // only the characters of set D (class 0 in the table) are written as
        // is, all the others are BASE64-encoded
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if (buf)
                *buf++ = '+';

            len++;
            // the plus sign itself is encoded as just "+-", without any
            // BASE64 characters in between
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it not yet written out; the loop continues for as
                // long as the following characters can't be written directly
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the high and then the low byte of the character
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all the complete 6 bit groups we have
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // psz is only advanced (by the loop increment) if the
                    // next character is part of this BASE64 sequence too
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // output the remaining bits, padded with zeroes on the
                    // right to make a full 6 bit group
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // the BASE64 sequence is always explicitly terminated
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 715
 716 // ----------------------------------------------------------------------------
 717 // UTF-8
 718 // ----------------------------------------------------------------------------
 719
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes in
// UTF-8; the last element is greater than or equal to any 32 bit value and so
// ensures that the search for the needed length in this table always stops
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area range we use to (temporarily) remap the
// bytes which are invalid in a UTF-8 encoded string: the 256 values starting
// at wxUnicodePUA correspond to the 256 possible byte values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 727
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// If buf is not NULL, the output is stored in it, followed by a NUL if fewer
// than n characters were produced. If buf is NULL, nothing is stored and only
// the length is computed.
//
// The handling of the invalid UTF-8 sequences depends on m_options: each byte
// of such sequence is either mapped to a character in the range starting at
// wxUnicodePUA (MAP_INVALID_UTF8_TO_PUA) or replaced with a backslash
// followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL, in which case the
// backslashes in the input are doubled) or, if neither option is set, makes
// the function fail.
//
// Returns the number of wide characters (not counting the trailing NUL)
// produced or wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to reprocess its
        // bytes one by one if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;
        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of the continuation bytes which
            // must follow the first one
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // a value which could have been encoded using fewer bytes is
                // rejected as well
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // process all the bytes of the invalid sequence consumed so
                // far individually
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if there is enough room
                        // for all 4 of its characters but len is increased
                        // even if it isn't
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 871
 872 static inline bool isoctal(wchar_t wch)
 873 {
 874     return L'0' <= wch && wch <= L'7';
 875 }
 876
// Converts the NUL-terminated wide character string psz to UTF-8.
//
// If buf is not NULL, the output is stored in it, followed by a NUL if fewer
// than n bytes were produced. If buf is NULL, nothing is stored and only the
// length is computed.
//
// Depending on m_options, the mappings used for the invalid UTF-8 sequences
// are undone: the characters in the range starting at wxUnicodePUA are
// converted back to single bytes (MAP_INVALID_UTF8_TO_PUA) or the doubled
// backslashes and the backslashes followed by 3 octal digits are replaced
// with a single byte (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of bytes (not counting the trailing NUL) produced.
//
// NOTE(review): n is only checked before converting each character, the
// bytes of a multibyte sequence are written without being checked against
// it -- the caller presumably must provide a buffer big enough for the
// entire output, confirm.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // if decoding fails, just skip one code unit (cc contains its value
        // in this case)
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output only one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte with the given octal value
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of bytes, less one, needed to encode this
            // character
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the first byte starts with cnt + 1 bits set to 1 and
                    // contains the highest bits of the character, each of
                    // the cnt following bytes contains 6 more bits of it
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 954
 955 // ============================================================================
 956 // UTF-16
 957 // ============================================================================
 958
// wxMBConvUTF16straight is the name used below for the UTF-16 conversion
// using the same byte order as this platform and wxMBConvUTF16swap for the
// one using the opposite byte order (and hence, presumably, having to swap
// the bytes): map them to the BE and LE classes accordingly
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 966
 967 /* static */
 968 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 969 {
 970     if ( srcLen == wxNO_LEN )
 971     {
 972         // count the number of bytes in input, including the trailing NULs
 973         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 974         for ( srcLen = 1; *inBuff++; srcLen++ )
 975             ;
 976
 977         srcLen *= BYTES_PER_CHAR;
 978     }
 979     else // we already have the length
 980     {
 981         // we can only convert an entire number of UTF-16 characters
 982         if ( srcLen % BYTES_PER_CHAR )
 983             return wxCONV_FAILED;
 984     }
 985
 986     return srcLen;
 987 }
 988
 989 // case when in-memory representation is UTF-16 too
 990 #ifdef WC_UTF16
 991
 992 // ----------------------------------------------------------------------------
 993 // conversions without endianness change
 994 // ----------------------------------------------------------------------------
 995
 996 size_t
 997 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 998                                const char *src, size_t srcLen) const
 999 {
1000     // set up the scene for using memcpy() (which is presumably more efficient
1001     // than copying the bytes one by one)
1002     srcLen = GetLength(src, srcLen);
1003     if ( srcLen == wxNO_LEN )
1004         return wxCONV_FAILED;
1005
1006     const size_t inLen = srcLen / BYTES_PER_CHAR;
1007     if ( dst )
1008     {
1009         if ( dstLen < inLen )
1010             return wxCONV_FAILED;
1011
1012         memcpy(dst, src, srcLen);
1013     }
1014
1015     return inLen;
1016 }
1017
1018 size_t
1019 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1020                                  const wchar_t *src, size_t srcLen) const
1021 {
1022     if ( srcLen == wxNO_LEN )
1023         srcLen = wxWcslen(src) + 1;
1024
1025     srcLen *= BYTES_PER_CHAR;
1026
1027     if ( dst )
1028     {
1029         if ( dstLen < srcLen )
1030             return wxCONV_FAILED;
1031
1032         memcpy(dst, src, srcLen);
1033     }
1034
1035     return srcLen;
1036 }
1037
1038 // ----------------------------------------------------------------------------
1039 // endian-reversing conversions
1040 // ----------------------------------------------------------------------------
1041
1042 size_t
1043 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1044                            const char *src, size_t srcLen) const
1045 {
1046     srcLen = GetLength(src, srcLen);
1047     if ( srcLen == wxNO_LEN )
1048         return wxCONV_FAILED;
1049
1050     srcLen /= BYTES_PER_CHAR;
1051
1052     if ( dst )
1053     {
1054         if ( dstLen < srcLen )
1055             return wxCONV_FAILED;
1056
1057         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1058         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1059         {
1060             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1061         }
1062     }
1063
1064     return srcLen;
1065 }
1066
1067 size_t
1068 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1069                              const wchar_t *src, size_t srcLen) const
1070 {
1071     if ( srcLen == wxNO_LEN )
1072         srcLen = wxWcslen(src) + 1;
1073
1074     srcLen *= BYTES_PER_CHAR;
1075
1076     if ( dst )
1077     {
1078         if ( dstLen < srcLen )
1079             return wxCONV_FAILED;
1080
1081         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1082         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1083         {
1084             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1085         }
1086     }
1087
1088     return srcLen;
1089 }
1090
1091 #else // !WC_UTF16: wchar_t is UTF-32
1092
1093 // ----------------------------------------------------------------------------
1094 // conversions without endianness change
1095 // ----------------------------------------------------------------------------
1096
1097 size_t
1098 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1099                                const char *src, size_t srcLen) const
1100 {
1101     srcLen = GetLength(src, srcLen);
1102     if ( srcLen == wxNO_LEN )
1103         return wxCONV_FAILED;
1104
1105     const size_t inLen = srcLen / BYTES_PER_CHAR;
1106     if ( !dst )
1107     {
1108         // optimization: return maximal space which could be needed for this
1109         // string even if the real size could be smaller if the buffer contains
1110         // any surrogates
1111         return inLen;
1112     }
1113
1114     size_t outLen = 0;
1115     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1116     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1117     {
1118         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1119         if ( !inBuff )
1120             return wxCONV_FAILED;
1121
1122         if ( ++outLen > dstLen )
1123             return wxCONV_FAILED;
1124
1125         *dst++ = ch;
1126     }
1127
1128
1129     return outLen;
1130 }
1131
1132 size_t
1133 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1134                                  const wchar_t *src, size_t srcLen) const
1135 {
1136     if ( srcLen == wxNO_LEN )
1137         srcLen = wxWcslen(src) + 1;
1138
1139     size_t outLen = 0;
1140     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1141     for ( size_t n = 0; n < srcLen; n++ )
1142     {
1143         wxUint16 cc[2];
1144         const size_t numChars = encode_utf16(*src++, cc);
1145         if ( numChars == wxCONV_FAILED )
1146             return wxCONV_FAILED;
1147
1148         outLen += numChars * BYTES_PER_CHAR;
1149         if ( outBuff )
1150         {
1151             if ( outLen > dstLen )
1152                 return wxCONV_FAILED;
1153
1154             *outBuff++ = cc[0];
1155             if ( numChars == 2 )
1156             {
1157                 // second character of a surrogate
1158                 *outBuff++ = cc[1];
1159             }
1160         }
1161     }
1162
1163     return outLen;
1164 }
1165
1166 // ----------------------------------------------------------------------------
1167 // endian-reversing conversions
1168 // ----------------------------------------------------------------------------
1169
1170 size_t
1171 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1172                            const char *src, size_t srcLen) const
1173 {
1174     srcLen = GetLength(src, srcLen);
1175     if ( srcLen == wxNO_LEN )
1176         return wxCONV_FAILED;
1177
1178     const size_t inLen = srcLen / BYTES_PER_CHAR;
1179     if ( !dst )
1180     {
1181         // optimization: return maximal space which could be needed for this
1182         // string even if the real size could be smaller if the buffer contains
1183         // any surrogates
1184         return inLen;
1185     }
1186
1187     size_t outLen = 0;
1188     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1189     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1190     {
1191         wxUint32 ch;
1192         wxUint16 tmp[2];
1193
1194         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1195         inBuff++;
1196         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1197
1198         const size_t numChars = decode_utf16(tmp, ch);
1199         if ( numChars == wxCONV_FAILED )
1200             return wxCONV_FAILED;
1201
1202         if ( numChars == 2 )
1203             inBuff++;
1204
1205         if ( ++outLen > dstLen )
1206             return wxCONV_FAILED;
1207
1208         *dst++ = ch;
1209     }
1210
1211
1212     return outLen;
1213 }
1214
1215 size_t
1216 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1217                              const wchar_t *src, size_t srcLen) const
1218 {
1219     if ( srcLen == wxNO_LEN )
1220         srcLen = wxWcslen(src) + 1;
1221
1222     size_t outLen = 0;
1223     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1224     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1225     {
1226         wxUint16 cc[2];
1227         const size_t numChars = encode_utf16(*src, cc);
1228         if ( numChars == wxCONV_FAILED )
1229             return wxCONV_FAILED;
1230
1231         outLen += numChars * BYTES_PER_CHAR;
1232         if ( outBuff )
1233         {
1234             if ( outLen > dstLen )
1235                 return wxCONV_FAILED;
1236
1237             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1238             if ( numChars == 2 )
1239             {
1240                 // second character of a surrogate
1241                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1242             }
1243         }
1244     }
1245
1246     return outLen;
1247 }
1248
1249 #endif // WC_UTF16/!WC_UTF16
1250
1251
1252 // ============================================================================
1253 // UTF-32
1254 // ============================================================================
1255
1256 #ifdef WORDS_BIGENDIAN
1257     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1258     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1259 #else
1260     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1261     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1262 #endif
1263
1264
1265 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1266 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1267
1268 /* static */
1269 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1270 {
1271     if ( srcLen == wxNO_LEN )
1272     {
1273         // count the number of bytes in input, including the trailing NULs
1274         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1275         for ( srcLen = 1; *inBuff++; srcLen++ )
1276             ;
1277
1278         srcLen *= BYTES_PER_CHAR;
1279     }
1280     else // we already have the length
1281     {
1282         // we can only convert an entire number of UTF-32 characters
1283         if ( srcLen % BYTES_PER_CHAR )
1284             return wxCONV_FAILED;
1285     }
1286
1287     return srcLen;
1288 }
1289
1290 // case when in-memory representation is UTF-16
1291 #ifdef WC_UTF16
1292
1293 // ----------------------------------------------------------------------------
1294 // conversions without endianness change
1295 // ----------------------------------------------------------------------------
1296
1297 size_t
1298 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1299                                const char *src, size_t srcLen) const
1300 {
1301     srcLen = GetLength(src, srcLen);
1302     if ( srcLen == wxNO_LEN )
1303         return wxCONV_FAILED;
1304
1305     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1306     const size_t inLen = srcLen / BYTES_PER_CHAR;
1307     size_t outLen = 0;
1308     for ( size_t n = 0; n < inLen; n++ )
1309     {
1310         wxUint16 cc[2];
1311         const size_t numChars = encode_utf16(*inBuff++, cc);
1312         if ( numChars == wxCONV_FAILED )
1313             return wxCONV_FAILED;
1314
1315         outLen += numChars;
1316         if ( dst )
1317         {
1318             if ( outLen > dstLen )
1319                 return wxCONV_FAILED;
1320
1321             *dst++ = cc[0];
1322             if ( numChars == 2 )
1323             {
1324                 // second character of a surrogate
1325                 *dst++ = cc[1];
1326             }
1327         }
1328     }
1329
1330     return outLen;
1331 }
1332
1333 size_t
1334 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1335                                  const wchar_t *src, size_t srcLen) const
1336 {
1337     if ( srcLen == wxNO_LEN )
1338         srcLen = wxWcslen(src) + 1;
1339
1340     if ( !dst )
1341     {
1342         // optimization: return maximal space which could be needed for this
1343         // string instead of the exact amount which could be less if there are
1344         // any surrogates in the input
1345         //
1346         // we consider that surrogates are rare enough to make it worthwhile to
1347         // avoid running the loop below at the cost of slightly extra memory
1348         // consumption
1349         return srcLen * BYTES_PER_CHAR;
1350     }
1351
1352     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1353     size_t outLen = 0;
1354     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1355     {
1356         const wxUint32 ch = wxDecodeSurrogate(&src);
1357         if ( !src )
1358             return wxCONV_FAILED;
1359
1360         outLen += BYTES_PER_CHAR;
1361
1362         if ( outLen > dstLen )
1363             return wxCONV_FAILED;
1364
1365         *outBuff++ = ch;
1366     }
1367
1368     return outLen;
1369 }
1370
1371 // ----------------------------------------------------------------------------
1372 // endian-reversing conversions
1373 // ----------------------------------------------------------------------------
1374
1375 size_t
1376 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1377                            const char *src, size_t srcLen) const
1378 {
1379     srcLen = GetLength(src, srcLen);
1380     if ( srcLen == wxNO_LEN )
1381         return wxCONV_FAILED;
1382
1383     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1384     const size_t inLen = srcLen / BYTES_PER_CHAR;
1385     size_t outLen = 0;
1386     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1387     {
1388         wxUint16 cc[2];
1389         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1390         if ( numChars == wxCONV_FAILED )
1391             return wxCONV_FAILED;
1392
1393         outLen += numChars;
1394         if ( dst )
1395         {
1396             if ( outLen > dstLen )
1397                 return wxCONV_FAILED;
1398
1399             *dst++ = cc[0];
1400             if ( numChars == 2 )
1401             {
1402                 // second character of a surrogate
1403                 *dst++ = cc[1];
1404             }
1405         }
1406     }
1407
1408     return outLen;
1409 }
1410
1411 size_t
1412 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1413                              const wchar_t *src, size_t srcLen) const
1414 {
1415     if ( srcLen == wxNO_LEN )
1416         srcLen = wxWcslen(src) + 1;
1417
1418     if ( !dst )
1419     {
1420         // optimization: return maximal space which could be needed for this
1421         // string instead of the exact amount which could be less if there are
1422         // any surrogates in the input
1423         //
1424         // we consider that surrogates are rare enough to make it worthwhile to
1425         // avoid running the loop below at the cost of slightly extra memory
1426         // consumption
1427         return srcLen*BYTES_PER_CHAR;
1428     }
1429
1430     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1431     size_t outLen = 0;
1432     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1433     {
1434         const wxUint32 ch = wxDecodeSurrogate(&src);
1435         if ( !src )
1436             return wxCONV_FAILED;
1437
1438         outLen += BYTES_PER_CHAR;
1439
1440         if ( outLen > dstLen )
1441             return wxCONV_FAILED;
1442
1443         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1444     }
1445
1446     return outLen;
1447 }
1448
1449 #else // !WC_UTF16: wchar_t is UTF-32
1450
1451 // ----------------------------------------------------------------------------
1452 // conversions without endianness change
1453 // ----------------------------------------------------------------------------
1454
1455 size_t
1456 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1457                                const char *src, size_t srcLen) const
1458 {
1459     // use memcpy() as it should be much faster than hand-written loop
1460     srcLen = GetLength(src, srcLen);
1461     if ( srcLen == wxNO_LEN )
1462         return wxCONV_FAILED;
1463
1464     const size_t inLen = srcLen/BYTES_PER_CHAR;
1465     if ( dst )
1466     {
1467         if ( dstLen < inLen )
1468             return wxCONV_FAILED;
1469
1470         memcpy(dst, src, srcLen);
1471     }
1472
1473     return inLen;
1474 }
1475
1476 size_t
1477 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1478                                  const wchar_t *src, size_t srcLen) const
1479 {
1480     if ( srcLen == wxNO_LEN )
1481         srcLen = wxWcslen(src) + 1;
1482
1483     srcLen *= BYTES_PER_CHAR;
1484
1485     if ( dst )
1486     {
1487         if ( dstLen < srcLen )
1488             return wxCONV_FAILED;
1489
1490         memcpy(dst, src, srcLen);
1491     }
1492
1493     return srcLen;
1494 }
1495
1496 // ----------------------------------------------------------------------------
1497 // endian-reversing conversions
1498 // ----------------------------------------------------------------------------
1499
1500 size_t
1501 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1502                            const char *src, size_t srcLen) const
1503 {
1504     srcLen = GetLength(src, srcLen);
1505     if ( srcLen == wxNO_LEN )
1506         return wxCONV_FAILED;
1507
1508     srcLen /= BYTES_PER_CHAR;
1509
1510     if ( dst )
1511     {
1512         if ( dstLen < srcLen )
1513             return wxCONV_FAILED;
1514
1515         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1516         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1517         {
1518             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1519         }
1520     }
1521
1522     return srcLen;
1523 }
1524
1525 size_t
1526 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1527                              const wchar_t *src, size_t srcLen) const
1528 {
1529     if ( srcLen == wxNO_LEN )
1530         srcLen = wxWcslen(src) + 1;
1531
1532     srcLen *= BYTES_PER_CHAR;
1533
1534     if ( dst )
1535     {
1536         if ( dstLen < srcLen )
1537             return wxCONV_FAILED;
1538
1539         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1540         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1541         {
1542             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1543         }
1544     }
1545
1546     return srcLen;
1547 }
1548
1549 #endif // WC_UTF16/!WC_UTF16
1550
1551
1552 // ============================================================================
1553 // The classes doing conversion using the iconv_xxx() functions
1554 // ============================================================================
1555
1556 #ifdef HAVE_ICONV
1557
1558 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1559 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1560 //     (unless there's yet another bug in glibc) the only case when iconv()
1561 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1562 //     left in the input buffer -- when _real_ error occurs,
1563 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1564 //     iconv() failure.
1565 //     [This bug does not appear in glibc 2.2.]
1566 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1567 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1568                                      (errno != E2BIG || bufLeft != 0))
1569 #else
1570 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1571 #endif
1572
1573 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1574
1575 #define ICONV_T_INVALID ((iconv_t)-1)
1576
1577 #if SIZEOF_WCHAR_T == 4
1578     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1579     #define WC_ENC      wxFONTENCODING_UTF32
1580 #elif SIZEOF_WCHAR_T == 2
1581     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1582     #define WC_ENC      wxFONTENCODING_UTF16
1583 #else // sizeof(wchar_t) != 2 nor 4
1584     // does this ever happen?
1585     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1586 #endif
1587
1588 // ----------------------------------------------------------------------------
1589 // wxMBConv_iconv: encapsulates an iconv character set
1590 // ----------------------------------------------------------------------------
1591
1592 class wxMBConv_iconv : public wxMBConv
1593 {
1594 public:
1595     wxMBConv_iconv(const char *name);
1596     virtual ~wxMBConv_iconv();
1597
1598     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1599     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1600
1601     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1602     virtual size_t GetMBNulLen() const;
1603
1604 #if wxUSE_UNICODE_UTF8
1605     virtual bool IsUTF8() const;
1606 #endif
1607
1608     virtual wxMBConv *Clone() const
1609     {
1610         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1611         p->m_minMBCharWidth = m_minMBCharWidth;
1612         return p;
1613     }
1614
1615     bool IsOk() const
1616         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1617
1618 protected:
1619     // the iconv handlers used to translate from multibyte
1620     // to wide char and in the other direction
1621     iconv_t m2w,
1622             w2m;
1623
1624 #if wxUSE_THREADS
1625     // guards access to m2w and w2m objects
1626     wxMutex m_iconvMutex;
1627 #endif
1628
1629 private:
1630     // the name (for iconv_open()) of a wide char charset -- if none is
1631     // available on this machine, it will remain NULL
1632     static wxString ms_wcCharsetName;
1633
1634     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1635     // different endian-ness than the native one
1636     static bool ms_wcNeedsSwap;
1637
1638
1639     // name of the encoding handled by this conversion
1640     wxString m_name;
1641
1642     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1643     // initially
1644     size_t m_minMBCharWidth;
1645 };
1646
1647 // make the constructor available for unit testing
1648 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1649 {
1650     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1651     if ( !result->IsOk() )
1652     {
1653         delete result;
1654         return 0;
1655     }
1656
1657     return result;
1658 }
1659
1660 wxString wxMBConv_iconv::ms_wcCharsetName;
1661 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1662
1663 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1664               : m_name(name)
1665 {
1666     m_minMBCharWidth = 0;
1667
1668     // check for charset that represents wchar_t:
1669     if ( ms_wcCharsetName.empty() )
1670     {
1671         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1672
1673 #if wxUSE_FONTMAP
1674         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1675 #else // !wxUSE_FONTMAP
1676         static const wxChar *names_static[] =
1677         {
1678 #if SIZEOF_WCHAR_T == 4
1679             _T("UCS-4"),
1680 #elif SIZEOF_WCHAR_T = 2
1681             _T("UCS-2"),
1682 #endif
1683             NULL
1684         };
1685         const wxChar **names = names_static;
1686 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1687
1688         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1689         {
1690             const wxString nameCS(*names);
1691
1692             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1693             wxString nameXE(nameCS);
1694
1695 #ifdef WORDS_BIGENDIAN
1696                 nameXE += _T("BE");
1697 #else // little endian
1698                 nameXE += _T("LE");
1699 #endif
1700
1701             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1702                        nameXE.c_str());
1703
1704             m2w = iconv_open(nameXE.ToAscii(), name);
1705             if ( m2w == ICONV_T_INVALID )
1706             {
1707                 // try charset w/o bytesex info (e.g. "UCS4")
1708                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1709                            nameCS.c_str());
1710                 m2w = iconv_open(nameCS.ToAscii(), name);
1711
1712                 // and check for bytesex ourselves:
1713                 if ( m2w != ICONV_T_INVALID )
1714                 {
1715                     char    buf[2], *bufPtr;
1716                     wchar_t wbuf[2], *wbufPtr;
1717                     size_t  insz, outsz;
1718                     size_t  res;
1719
1720                     buf[0] = 'A';
1721                     buf[1] = 0;
1722                     wbuf[0] = 0;
1723                     insz = 2;
1724                     outsz = SIZEOF_WCHAR_T * 2;
1725                     wbufPtr = wbuf;
1726                     bufPtr = buf;
1727
1728                     res = iconv(
1729                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1730                         (char**)&wbufPtr, &outsz);
1731
1732                     if (ICONV_FAILED(res, insz))
1733                     {
1734                         wxLogLastError(wxT("iconv"));
1735                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1736                                    nameCS.c_str());
1737                     }
1738                     else // ok, can convert to this encoding, remember it
1739                     {
1740                         ms_wcCharsetName = nameCS;
1741                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1742                     }
1743                 }
1744             }
1745             else // use charset not requiring byte swapping
1746             {
1747                 ms_wcCharsetName = nameXE;
1748             }
1749         }
1750
1751         wxLogTrace(TRACE_STRCONV,
1752                    wxT("iconv wchar_t charset is \"%s\"%s"),
1753                    ms_wcCharsetName.empty() ? wxString("<none>")
1754                                             : ms_wcCharsetName,
1755                    ms_wcNeedsSwap ? _T(" (needs swap)")
1756                                   : _T(""));
1757     }
1758     else // we already have ms_wcCharsetName
1759     {
1760         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
1761     }
1762
1763     if ( ms_wcCharsetName.empty() )
1764     {
1765         w2m = ICONV_T_INVALID;
1766     }
1767     else
1768     {
1769         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
1770         if ( w2m == ICONV_T_INVALID )
1771         {
1772             wxLogTrace(TRACE_STRCONV,
1773                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1774                        ms_wcCharsetName.c_str(), name);
1775         }
1776     }
1777 }
1778
1779 wxMBConv_iconv::~wxMBConv_iconv()
1780 {
1781     if ( m2w != ICONV_T_INVALID )
1782         iconv_close(m2w);
1783     if ( w2m != ICONV_T_INVALID )
1784         iconv_close(w2m);
1785 }
1786
1787 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1788 {
1789     // find the string length: notice that must be done differently for
1790     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1791     size_t inbuf;
1792     const size_t nulLen = GetMBNulLen();
1793     switch ( nulLen )
1794     {
1795         default:
1796             return wxCONV_FAILED;
1797
1798         case 1:
1799             inbuf = strlen(psz); // arguably more optimized than our version
1800             break;
1801
1802         case 2:
1803         case 4:
1804             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1805             // they also have to start at character boundary and not span two
1806             // adjacent characters
1807             const char *p;
1808             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1809                 ;
1810             inbuf = p - psz;
1811             break;
1812     }
1813
1814 #if wxUSE_THREADS
1815     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1816     //     Unfortunately there are a couple of global wxCSConv objects such as
1817     //     wxConvLocal that are used all over wx code, so we have to make sure
1818     //     the handle is used by at most one thread at the time. Otherwise
1819     //     only a few wx classes would be safe to use from non-main threads
1820     //     as MB<->WC conversion would fail "randomly".
1821     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1822 #endif // wxUSE_THREADS
1823
1824     size_t outbuf = n * SIZEOF_WCHAR_T;
1825     size_t res, cres;
1826     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1827     wchar_t *bufPtr = buf;
1828     const char *pszPtr = psz;
1829
1830     if (buf)
1831     {
1832         // have destination buffer, convert there
1833         cres = iconv(m2w,
1834                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835                      (char**)&bufPtr, &outbuf);
1836         res = n - (outbuf / SIZEOF_WCHAR_T);
1837
1838         if (ms_wcNeedsSwap)
1839         {
1840             // convert to native endianness
1841             for ( unsigned i = 0; i < res; i++ )
1842                 buf[n] = WC_BSWAP(buf[i]);
1843         }
1844
1845         // NUL-terminate the string if there is any space left
1846         if (res < n)
1847             buf[res] = 0;
1848     }
1849     else
1850     {
1851         // no destination buffer... convert using temp buffer
1852         // to calculate destination buffer requirement
1853         wchar_t tbuf[8];
1854         res = 0;
1855
1856         do
1857         {
1858             bufPtr = tbuf;
1859             outbuf = 8 * SIZEOF_WCHAR_T;
1860
1861             cres = iconv(m2w,
1862                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1863                          (char**)&bufPtr, &outbuf );
1864
1865             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1866         }
1867         while ((cres == (size_t)-1) && (errno == E2BIG));
1868     }
1869
1870     if (ICONV_FAILED(cres, inbuf))
1871     {
1872         //VS: it is ok if iconv fails, hence trace only
1873         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1874         return wxCONV_FAILED;
1875     }
1876
1877     return res;
1878 }
1879
1880 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1881 {
1882 #if wxUSE_THREADS
1883     // NB: explained in MB2WC
1884     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1885 #endif
1886
1887     size_t inlen = wxWcslen(psz);
1888     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1889     size_t outbuf = n;
1890     size_t res, cres;
1891
1892     wchar_t *tmpbuf = 0;
1893
1894     if (ms_wcNeedsSwap)
1895     {
1896         // need to copy to temp buffer to switch endianness
1897         // (doing WC_BSWAP twice on the original buffer won't help, as it
1898         //  could be in read-only memory, or be accessed in some other thread)
1899         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1900         for ( size_t i = 0; i < inlen; i++ )
1901             tmpbuf[n] = WC_BSWAP(psz[i]);
1902
1903         tmpbuf[inlen] = L'\0';
1904         psz = tmpbuf;
1905     }
1906
1907     if (buf)
1908     {
1909         // have destination buffer, convert there
1910         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1911
1912         res = n - outbuf;
1913
1914         // NB: iconv was given only wcslen(psz) characters on input, and so
1915         //     it couldn't convert the trailing zero. Let's do it ourselves
1916         //     if there's some room left for it in the output buffer.
1917         if (res < n)
1918             buf[0] = 0;
1919     }
1920     else
1921     {
1922         // no destination buffer: convert using temp buffer
1923         // to calculate destination buffer requirement
1924         char tbuf[16];
1925         res = 0;
1926         do
1927         {
1928             buf = tbuf;
1929             outbuf = 16;
1930
1931             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1932
1933             res += 16 - outbuf;
1934         }
1935         while ((cres == (size_t)-1) && (errno == E2BIG));
1936     }
1937
1938     if (ms_wcNeedsSwap)
1939     {
1940         free(tmpbuf);
1941     }
1942
1943     if (ICONV_FAILED(cres, inbuf))
1944     {
1945         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1946         return wxCONV_FAILED;
1947     }
1948
1949     return res;
1950 }
1951
1952 size_t wxMBConv_iconv::GetMBNulLen() const
1953 {
1954     if ( m_minMBCharWidth == 0 )
1955     {
1956         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1957
1958 #if wxUSE_THREADS
1959         // NB: explained in MB2WC
1960         wxMutexLocker lock(self->m_iconvMutex);
1961 #endif
1962
1963         const wchar_t *wnul = L"";
1964         char buf[8]; // should be enough for NUL in any encoding
1965         size_t inLen = sizeof(wchar_t),
1966                outLen = WXSIZEOF(buf);
1967         char *inBuff = (char *)wnul;
1968         char *outBuff = buf;
1969         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1970         {
1971             self->m_minMBCharWidth = (size_t)-1;
1972         }
1973         else // ok
1974         {
1975             self->m_minMBCharWidth = outBuff - buf;
1976         }
1977     }
1978
1979     return m_minMBCharWidth;
1980 }
1981
1982 #if wxUSE_UNICODE_UTF8
1983 bool wxMBConv_iconv::IsUTF8() const
1984 {
1985     return wxStricmp(m_name, "UTF-8") == 0 ||
1986            wxStricmp(m_name, "UTF8") == 0;
1987 }
1988 #endif
1989
1990 #endif // HAVE_ICONV
1991
1992
1993 // ============================================================================
1994 // Win32 conversion classes
1995 // ============================================================================
1996
1997 #ifdef wxHAVE_WIN32_MB2WC
1998
1999 // from utils.cpp
2000 #if wxUSE_FONTMAP
2001 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2002 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2003 #endif
2004
2005 class wxMBConv_win32 : public wxMBConv
2006 {
2007 public:
2008     wxMBConv_win32()
2009     {
2010         m_CodePage = CP_ACP;
2011         m_minMBCharWidth = 0;
2012     }
2013
2014     wxMBConv_win32(const wxMBConv_win32& conv)
2015         : wxMBConv()
2016     {
2017         m_CodePage = conv.m_CodePage;
2018         m_minMBCharWidth = conv.m_minMBCharWidth;
2019     }
2020
2021 #if wxUSE_FONTMAP
2022     wxMBConv_win32(const char* name)
2023     {
2024         m_CodePage = wxCharsetToCodepage(name);
2025         m_minMBCharWidth = 0;
2026     }
2027
2028     wxMBConv_win32(wxFontEncoding encoding)
2029     {
2030         m_CodePage = wxEncodingToCodepage(encoding);
2031         m_minMBCharWidth = 0;
2032     }
2033 #endif // wxUSE_FONTMAP
2034
2035     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2036     {
2037         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2038         // the behaviour is not compatible with the Unix version (using iconv)
2039         // and break the library itself, e.g. wxTextInputStream::NextChar()
2040         // wouldn't work if reading an incomplete MB char didn't result in an
2041         // error
2042         //
2043         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2044         // Win XP or newer and it is not supported for UTF-[78] so we always
2045         // use our own conversions in this case. See
2046         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2047         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2048         if ( m_CodePage == CP_UTF8 )
2049         {
2050             return wxMBConvUTF8().MB2WC(buf, psz, n);
2051         }
2052
2053         if ( m_CodePage == CP_UTF7 )
2054         {
2055             return wxMBConvUTF7().MB2WC(buf, psz, n);
2056         }
2057
2058         int flags = 0;
2059         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2060                 IsAtLeastWin2kSP4() )
2061         {
2062             flags = MB_ERR_INVALID_CHARS;
2063         }
2064
2065         const size_t len = ::MultiByteToWideChar
2066                              (
2067                                 m_CodePage,     // code page
2068                                 flags,          // flags: fall on error
2069                                 psz,            // input string
2070                                 -1,             // its length (NUL-terminated)
2071                                 buf,            // output string
2072                                 buf ? n : 0     // size of output buffer
2073                              );
2074         if ( !len )
2075         {
2076             // function totally failed
2077             return wxCONV_FAILED;
2078         }
2079
2080         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2081         // check if we succeeded, by doing a double trip:
2082         if ( !flags && buf )
2083         {
2084             const size_t mbLen = strlen(psz);
2085             wxCharBuffer mbBuf(mbLen);
2086             if ( ::WideCharToMultiByte
2087                    (
2088                       m_CodePage,
2089                       0,
2090                       buf,
2091                       -1,
2092                       mbBuf.data(),
2093                       mbLen + 1,        // size in bytes, not length
2094                       NULL,
2095                       NULL
2096                    ) == 0 ||
2097                   strcmp(mbBuf, psz) != 0 )
2098             {
2099                 // we didn't obtain the same thing we started from, hence
2100                 // the conversion was lossy and we consider that it failed
2101                 return wxCONV_FAILED;
2102             }
2103         }
2104
2105         // note that it returns count of written chars for buf != NULL and size
2106         // of the needed buffer for buf == NULL so in either case the length of
2107         // the string (which never includes the terminating NUL) is one less
2108         return len - 1;
2109     }
2110
2111     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2112     {
2113         /*
2114             we have a problem here: by default, WideCharToMultiByte() may
2115             replace characters unrepresentable in the target code page with bad
2116             quality approximations such as turning "1/2" symbol (U+00BD) into
2117             "1" for the code pages which don't have it and we, obviously, want
2118             to avoid this at any price
2119
2120             the trouble is that this function does it _silently_, i.e. it won't
2121             even tell us whether it did or not... Win98/2000 and higher provide
2122             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2123             we have to resort to a round trip, i.e. check that converting back
2124             results in the same string -- this is, of course, expensive but
2125             otherwise we simply can't be sure to not garble the data.
2126          */
2127
2128         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2129         // it doesn't work with CJK encodings (which we test for rather roughly
2130         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2131         // supporting it
2132         BOOL usedDef wxDUMMY_INITIALIZE(false);
2133         BOOL *pUsedDef;
2134         int flags;
2135         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2136         {
2137             // it's our lucky day
2138             flags = WC_NO_BEST_FIT_CHARS;
2139             pUsedDef = &usedDef;
2140         }
2141         else // old system or unsupported encoding
2142         {
2143             flags = 0;
2144             pUsedDef = NULL;
2145         }
2146
2147         const size_t len = ::WideCharToMultiByte
2148                              (
2149                                 m_CodePage,     // code page
2150                                 flags,          // either none or no best fit
2151                                 pwz,            // input string
2152                                 -1,             // it is (wide) NUL-terminated
2153                                 buf,            // output buffer
2154                                 buf ? n : 0,    // and its size
2155                                 NULL,           // default "replacement" char
2156                                 pUsedDef        // [out] was it used?
2157                              );
2158
2159         if ( !len )
2160         {
2161             // function totally failed
2162             return wxCONV_FAILED;
2163         }
2164
2165         // if we were really converting, check if we succeeded
2166         if ( buf )
2167         {
2168             if ( flags )
2169             {
2170                 // check if the conversion failed, i.e. if any replacements
2171                 // were done
2172                 if ( usedDef )
2173                     return wxCONV_FAILED;
2174             }
2175             else // we must resort to double tripping...
2176             {
2177                 wxWCharBuffer wcBuf(n);
2178                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2179                         wcscmp(wcBuf, pwz) != 0 )
2180                 {
2181                     // we didn't obtain the same thing we started from, hence
2182                     // the conversion was lossy and we consider that it failed
2183                     return wxCONV_FAILED;
2184                 }
2185             }
2186         }
2187
2188         // see the comment above for the reason of "len - 1"
2189         return len - 1;
2190     }
2191
2192     virtual size_t GetMBNulLen() const
2193     {
2194         if ( m_minMBCharWidth == 0 )
2195         {
2196             int len = ::WideCharToMultiByte
2197                         (
2198                             m_CodePage,     // code page
2199                             0,              // no flags
2200                             L"",            // input string
2201                             1,              // translate just the NUL
2202                             NULL,           // output buffer
2203                             0,              // and its size
2204                             NULL,           // no replacement char
2205                             NULL            // [out] don't care if it was used
2206                         );
2207
2208             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2209             switch ( len )
2210             {
2211                 default:
2212                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2213                     self->m_minMBCharWidth = (size_t)-1;
2214                     break;
2215
2216                 case 0:
2217                     self->m_minMBCharWidth = (size_t)-1;
2218                     break;
2219
2220                 case 1:
2221                 case 2:
2222                 case 4:
2223                     self->m_minMBCharWidth = len;
2224                     break;
2225             }
2226         }
2227
2228         return m_minMBCharWidth;
2229     }
2230
2231     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2232
2233     bool IsOk() const { return m_CodePage != -1; }
2234
2235 private:
2236     static bool CanUseNoBestFit()
2237     {
2238         static int s_isWin98Or2k = -1;
2239
2240         if ( s_isWin98Or2k == -1 )
2241         {
2242             int verMaj, verMin;
2243             switch ( wxGetOsVersion(&verMaj, &verMin) )
2244             {
2245                 case wxOS_WINDOWS_9X:
2246                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2247                     break;
2248
2249                 case wxOS_WINDOWS_NT:
2250                     s_isWin98Or2k = verMaj >= 5;
2251                     break;
2252
2253                 default:
2254                     // unknown: be conservative by default
2255                     s_isWin98Or2k = 0;
2256                     break;
2257             }
2258
2259             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2260         }
2261
2262         return s_isWin98Or2k == 1;
2263     }
2264
2265     static bool IsAtLeastWin2kSP4()
2266     {
2267 #ifdef __WXWINCE__
2268         return false;
2269 #else
2270         static int s_isAtLeastWin2kSP4 = -1;
2271
2272         if ( s_isAtLeastWin2kSP4 == -1 )
2273         {
2274             OSVERSIONINFOEX ver;
2275
2276             memset(&ver, 0, sizeof(ver));
2277             ver.dwOSVersionInfoSize = sizeof(ver);
2278             GetVersionEx((OSVERSIONINFO*)&ver);
2279
2280             s_isAtLeastWin2kSP4 =
2281               ((ver.dwMajorVersion > 5) || // Vista+
2282                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2283                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2284                ver.wServicePackMajor >= 4)) // 2000 SP4+
2285               ? 1 : 0;
2286         }
2287
2288         return s_isAtLeastWin2kSP4 == 1;
2289 #endif
2290     }
2291
2292
2293     // the code page we're working with
2294     long m_CodePage;
2295
2296     // cached result of GetMBNulLen(), set to 0 initially meaning
2297     // "unknown"
2298     size_t m_minMBCharWidth;
2299 };
2300
2301 #endif // wxHAVE_WIN32_MB2WC
2302
2303
2304 // ============================================================================
2305 // Mac conversion classes
2306 // ============================================================================
2307
2308 /* Although we are in the base library we currently have this wxMac
2309  * conditional.  This is not generally good but fortunately does not affect
2310  * the ABI of the base library, only what encodings might work.
2311  * It does mean that a wxBase built as part of wxMac has slightly more support
2312  * than one built for wxCocoa or even wxGtk.
2313  */
2314 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2315
2316 class wxMBConv_mac : public wxMBConv
2317 {
2318 public:
2319     wxMBConv_mac()
2320     {
2321         Init(CFStringGetSystemEncoding()) ;
2322     }
2323
2324     wxMBConv_mac(const wxMBConv_mac& conv)
2325     {
2326         Init(conv.m_char_encoding);
2327     }
2328
2329 #if wxUSE_FONTMAP
2330     wxMBConv_mac(const char* name)
2331     {
2332         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2333     }
2334 #endif
2335
2336     wxMBConv_mac(wxFontEncoding encoding)
2337     {
2338         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2339     }
2340
2341     virtual ~wxMBConv_mac()
2342     {
2343         OSStatus status = noErr ;
2344         if (m_MB2WC_converter)
2345             status = TECDisposeConverter(m_MB2WC_converter);
2346         if (m_WC2MB_converter)
2347             status = TECDisposeConverter(m_WC2MB_converter);
2348     }
2349
2350     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2351             TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2352     {
2353         m_MB2WC_converter = NULL ;
2354         m_WC2MB_converter = NULL ;
2355         m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2356         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2357     }
2358
2359     virtual void CreateIfNeeded() const
2360     {
2361         if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2362         {
2363             OSStatus status = noErr ;
2364             status = TECCreateConverter(&m_MB2WC_converter,
2365                                     m_char_encoding,
2366                                     m_unicode_encoding);
2367             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2368             status = TECCreateConverter(&m_WC2MB_converter,
2369                                     m_unicode_encoding,
2370                                     m_char_encoding);
2371             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2372         }
2373     }
2374
2375     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2376     {
2377         CreateIfNeeded() ;
2378         OSStatus status = noErr ;
2379         ByteCount byteOutLen ;
2380         ByteCount byteInLen = strlen(psz) + 1;
2381         wchar_t *tbuf = NULL ;
2382         UniChar* ubuf = NULL ;
2383         size_t res = 0 ;
2384
2385         if (buf == NULL)
2386         {
2387             // Apple specs say at least 32
2388             n = wxMax( 32, byteInLen ) ;
2389             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2390         }
2391
2392         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2393
2394 #if SIZEOF_WCHAR_T == 4
2395         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2396 #else
2397         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2398 #endif
2399
2400         status = TECConvertText(
2401             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2402             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2403
2404 #if SIZEOF_WCHAR_T == 4
2405         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2406         // is not properly terminated we get random characters at the end
2407         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2408         wxMBConvUTF16 converter ;
2409         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2410         free( ubuf ) ;
2411 #else
2412         res = byteOutLen / sizeof( UniChar ) ;
2413 #endif
2414
2415         if ( buf == NULL )
2416              free(tbuf) ;
2417
2418         if ( buf  && res < n)
2419             buf[res] = 0;
2420
2421         return res ;
2422     }
2423
2424     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2425     {
2426         CreateIfNeeded() ;
2427         OSStatus status = noErr ;
2428         ByteCount byteOutLen ;
2429         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2430
2431         char *tbuf = NULL ;
2432
2433         if (buf == NULL)
2434         {
2435             // Apple specs say at least 32
2436             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2437             tbuf = (char*) malloc( n ) ;
2438         }
2439
2440         ByteCount byteBufferLen = n ;
2441         UniChar* ubuf = NULL ;
2442
2443 #if SIZEOF_WCHAR_T == 4
2444         wxMBConvUTF16 converter ;
2445         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2446         byteInLen = unicharlen ;
2447         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2448         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2449 #else
2450         ubuf = (UniChar*) psz ;
2451 #endif
2452
2453         status = TECConvertText(
2454             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2455             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2456
2457 #if SIZEOF_WCHAR_T == 4
2458         free( ubuf ) ;
2459 #endif
2460
2461         if ( buf == NULL )
2462             free(tbuf) ;
2463
2464         size_t res = byteOutLen ;
2465         if ( buf  && res < n)
2466         {
2467             buf[res] = 0;
2468
2469             //we need to double-trip to verify it didn't insert any ? in place
2470             //of bogus characters
2471             wxWCharBuffer wcBuf(n);
2472             size_t pszlen = wxWcslen(psz);
2473             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2474                         wxWcslen(wcBuf) != pszlen ||
2475                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2476             {
2477                 // we didn't obtain the same thing we started from, hence
2478                 // the conversion was lossy and we consider that it failed
2479                 return wxCONV_FAILED;
2480             }
2481         }
2482
2483         return res ;
2484     }
2485
2486     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2487
2488     bool IsOk() const
2489     {
2490         CreateIfNeeded() ;
2491         return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2492     }
2493
2494 protected :
2495     mutable TECObjectRef m_MB2WC_converter;
2496     mutable TECObjectRef m_WC2MB_converter;
2497
2498     TextEncodingBase m_char_encoding;
2499     TextEncodingBase m_unicode_encoding;
2500 };
2501
2502 // MB is decomposed (D) normalized UTF8
2503
2504 class wxMBConv_macUTF8D : public wxMBConv_mac
2505 {
2506 public :
2507     wxMBConv_macUTF8D()
2508     {
2509         Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2510         m_uni = NULL;
2511         m_uniBack = NULL ;
2512     }
2513
2514     virtual ~wxMBConv_macUTF8D()
2515     {
2516         if (m_uni!=NULL)
2517             DisposeUnicodeToTextInfo(&m_uni);
2518         if (m_uniBack!=NULL)
2519             DisposeUnicodeToTextInfo(&m_uniBack);
2520     }
2521
2522     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2523     {
2524         CreateIfNeeded() ;
2525         OSStatus status = noErr ;
2526         ByteCount byteOutLen ;
2527         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2528
2529         char *tbuf = NULL ;
2530
2531         if (buf == NULL)
2532         {
2533             // Apple specs say at least 32
2534             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2535             tbuf = (char*) malloc( n ) ;
2536         }
2537
2538         ByteCount byteBufferLen = n ;
2539         UniChar* ubuf = NULL ;
2540
2541 #if SIZEOF_WCHAR_T == 4
2542         wxMBConvUTF16 converter ;
2543         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2544         byteInLen = unicharlen ;
2545         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2546         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2547 #else
2548         ubuf = (UniChar*) psz ;
2549 #endif
2550
2551         // ubuf is a non-decomposed UniChar buffer
2552
2553         ByteCount dcubuflen = byteInLen * 2 + 2 ;
2554         ByteCount dcubufread , dcubufwritten ;
2555         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2556
2557         ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2558             kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , dcubuf ) ;
2559
2560         // we now convert that decomposed buffer into UTF8
2561
2562         status = TECConvertText(
2563             m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2564             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2565
2566         free( dcubuf );
2567
2568 #if SIZEOF_WCHAR_T == 4
2569         free( ubuf ) ;
2570 #endif
2571
2572         if ( buf == NULL )
2573             free(tbuf) ;
2574
2575         size_t res = byteOutLen ;
2576         if ( buf  && res < n)
2577         {
2578             buf[res] = 0;
2579             // don't test for round-trip fidelity yet, we cannot guarantee it yet
2580         }
2581
2582         return res ;
2583     }
2584
2585     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2586     {
2587         CreateIfNeeded() ;
2588         OSStatus status = noErr ;
2589         ByteCount byteOutLen ;
2590         ByteCount byteInLen = strlen(psz) + 1;
2591         wchar_t *tbuf = NULL ;
2592         UniChar* ubuf = NULL ;
2593         size_t res = 0 ;
2594
2595         if (buf == NULL)
2596         {
2597             // Apple specs say at least 32
2598             n = wxMax( 32, byteInLen ) ;
2599             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2600         }
2601
2602         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2603
2604 #if SIZEOF_WCHAR_T == 4
2605         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2606 #else
2607         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2608 #endif
2609
2610         ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
2611         ByteCount dcubufread , dcubufwritten ;
2612         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2613
2614         status = TECConvertText(
2615                                 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2616                                 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
2617         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2618         // is not properly terminated we get random characters at the end
2619         dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2620
2621         // now from the decomposed UniChar to properly composed uniChar
2622         ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
2623                                   kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , ubuf ) ;
2624
2625         free( dcubuf );
2626         byteOutLen = dcubufwritten ;
2627         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2628
2629
2630 #if SIZEOF_WCHAR_T == 4
2631         wxMBConvUTF16 converter ;
2632         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2633         free( ubuf ) ;
2634 #else
2635         res = byteOutLen / sizeof( UniChar ) ;
2636 #endif
2637
2638         if ( buf == NULL )
2639             free(tbuf) ;
2640
2641         if ( buf  && res < n)
2642             buf[res] = 0;
2643
2644         return res ;
2645     }
2646
2647     virtual void CreateIfNeeded() const
2648     {
2649         wxMBConv_mac::CreateIfNeeded() ;
2650         if ( m_uni == NULL )
2651         {
2652             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2653                 kUnicodeNoSubset, kTextEncodingDefaultFormat);
2654             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2655                 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
2656             m_map.mappingVersion = kUnicodeUseLatestMapping;
2657
2658             OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
2659             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
2660
2661             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2662                                                        kUnicodeNoSubset, kTextEncodingDefaultFormat);
2663             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
2664                                                      kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
2665             m_map.mappingVersion = kUnicodeUseLatestMapping;
2666             err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
2667             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
2668         }
2669     }
2670 protected :
2671     mutable UnicodeToTextInfo   m_uni;
2672     mutable UnicodeToTextInfo   m_uniBack;
2673     mutable UnicodeMapping      m_map;
2674 };
2675 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2676
2677 // ============================================================================
2678 // wxEncodingConverter based conversion classes
2679 // ============================================================================
2680
2681 #if wxUSE_FONTMAP
2682
2683 class wxMBConv_wxwin : public wxMBConv
2684 {
2685 private:
2686     void Init()
2687     {
2688         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2689                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2690     }
2691
2692 public:
2693     // temporarily just use wxEncodingConverter stuff,
2694     // so that it works while a better implementation is built
2695     wxMBConv_wxwin(const char* name)
2696     {
2697         if (name)
2698             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2699         else
2700             m_enc = wxFONTENCODING_SYSTEM;
2701
2702         Init();
2703     }
2704
2705     wxMBConv_wxwin(wxFontEncoding enc)
2706     {
2707         m_enc = enc;
2708
2709         Init();
2710     }
2711
2712     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2713     {
2714         size_t inbuf = strlen(psz);
2715         if (buf)
2716         {
2717             if (!m2w.Convert(psz, buf))
2718                 return wxCONV_FAILED;
2719         }
2720         return inbuf;
2721     }
2722
2723     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2724     {
2725         const size_t inbuf = wxWcslen(psz);
2726         if (buf)
2727         {
2728             if (!w2m.Convert(psz, buf))
2729                 return wxCONV_FAILED;
2730         }
2731
2732         return inbuf;
2733     }
2734
2735     virtual size_t GetMBNulLen() const
2736     {
2737         switch ( m_enc )
2738         {
2739             case wxFONTENCODING_UTF16BE:
2740             case wxFONTENCODING_UTF16LE:
2741                 return 2;
2742
2743             case wxFONTENCODING_UTF32BE:
2744             case wxFONTENCODING_UTF32LE:
2745                 return 4;
2746
2747             default:
2748                 return 1;
2749         }
2750     }
2751
2752     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2753
2754     bool IsOk() const { return m_ok; }
2755
2756 public:
2757     wxFontEncoding m_enc;
2758     wxEncodingConverter m2w, w2m;
2759
2760 private:
2761     // were we initialized successfully?
2762     bool m_ok;
2763
2764     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2765 };
2766
2767 // make the constructors available for unit testing
2768 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2769 {
2770     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2771     if ( !result->IsOk() )
2772     {
2773         delete result;
2774         return 0;
2775     }
2776
2777     return result;
2778 }
2779
2780 #endif // wxUSE_FONTMAP
2781
2782 // ============================================================================
2783 // wxCSConv implementation
2784 // ============================================================================
2785
2786 void wxCSConv::Init()
2787 {
2788     m_name = NULL;
2789     m_convReal =  NULL;
2790     m_deferred = true;
2791 }
2792
2793 wxCSConv::wxCSConv(const wxString& charset)
2794 {
2795     Init();
2796
2797     if ( !charset.empty() )
2798     {
2799         SetName(charset.ToAscii());
2800     }
2801
2802 #if wxUSE_FONTMAP
2803     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2804 #else
2805     m_encoding = wxFONTENCODING_SYSTEM;
2806 #endif
2807 }
2808
2809 wxCSConv::wxCSConv(wxFontEncoding encoding)
2810 {
2811     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2812     {
2813         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2814
2815         encoding = wxFONTENCODING_SYSTEM;
2816     }
2817
2818     Init();
2819
2820     m_encoding = encoding;
2821 }
2822
2823 wxCSConv::~wxCSConv()
2824 {
2825     Clear();
2826 }
2827
2828 wxCSConv::wxCSConv(const wxCSConv& conv)
2829         : wxMBConv()
2830 {
2831     Init();
2832
2833     SetName(conv.m_name);
2834     m_encoding = conv.m_encoding;
2835 }
2836
2837 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2838 {
2839     Clear();
2840
2841     SetName(conv.m_name);
2842     m_encoding = conv.m_encoding;
2843
2844     return *this;
2845 }
2846
2847 void wxCSConv::Clear()
2848 {
2849     free(m_name);
2850     delete m_convReal;
2851
2852     m_name = NULL;
2853     m_convReal = NULL;
2854 }
2855
2856 void wxCSConv::SetName(const char *charset)
2857 {
2858     if (charset)
2859     {
2860         m_name = strdup(charset);
2861         m_deferred = true;
2862     }
2863 }
2864
2865 #if wxUSE_FONTMAP
2866
// Hash map type associating a charset name (wxString) with a wxFontEncoding.
2867 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2868                      wxEncodingNameCache );
2869
// Cache used by wxCSConv::DoCreate() for its iconv attempts: for an encoding
// it stores the name which was accepted by wxMBConv_iconv or an empty string
// if none of the names of this encoding worked.
2870 static wxEncodingNameCache gs_nameCache;
2871 #endif
2872
2873 wxMBConv *wxCSConv::DoCreate() const
2874 {
2875 #if wxUSE_FONTMAP
2876     wxLogTrace(TRACE_STRCONV,
2877                wxT("creating conversion for %s"),
2878                (m_name ? m_name
2879                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2880 #endif // wxUSE_FONTMAP
2881
2882     // check for the special case of ASCII or ISO8859-1 charset: as we have
2883     // special knowledge of it anyhow, we don't need to create a special
2884     // conversion object
2885     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2886             m_encoding == wxFONTENCODING_DEFAULT )
2887     {
2888         // don't convert at all
2889         return NULL;
2890     }
2891
2892     // we trust OS to do conversion better than we can so try external
2893     // conversion methods first
2894     //
2895     // the full order is:
2896     //      1. OS conversion (iconv() under Unix or Win32 API)
2897     //      2. hard coded conversions for UTF
2898     //      3. wxEncodingConverter as fall back
2899
2900     // step (1)
2901 #ifdef HAVE_ICONV
2902 #if !wxUSE_FONTMAP
2903     if ( m_name )
2904 #endif // !wxUSE_FONTMAP
2905     {
2906 #if wxUSE_FONTMAP
2907         wxFontEncoding encoding(m_encoding);
2908 #endif
2909
2910         if ( m_name )
2911         {
2912             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2913             if ( conv->IsOk() )
2914                 return conv;
2915
2916             delete conv;
2917
2918 #if wxUSE_FONTMAP
2919             encoding =
2920                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2921 #endif // wxUSE_FONTMAP
2922         }
2923 #if wxUSE_FONTMAP
2924         {
2925             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2926             if ( it != gs_nameCache.end() )
2927             {
2928                 if ( it->second.empty() )
2929                     return NULL;
2930
2931                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2932                 if ( conv->IsOk() )
2933                     return conv;
2934
2935                 delete conv;
2936             }
2937
2938             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2939             // CS : in case this does not return valid names (eg for MacRoman)
2940             // encoding got a 'failure' entry in the cache all the same,
2941             // although it just has to be created using a different method, so
2942             // only store failed iconv creation attempts (or perhaps we
2943             // shoulnd't do this at all ?)
2944             if ( names[0] != NULL )
2945             {
2946                 for ( ; *names; ++names )
2947                 {
2948                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2949                     //             will need changes that will obsolete this
2950                     wxString name(*names);
2951                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2952                     if ( conv->IsOk() )
2953                     {
2954                         gs_nameCache[encoding] = *names;
2955                         return conv;
2956                     }
2957
2958                     delete conv;
2959                 }
2960
2961                 gs_nameCache[encoding] = _T(""); // cache the failure
2962             }
2963         }
2964 #endif // wxUSE_FONTMAP
2965     }
2966 #endif // HAVE_ICONV
2967
2968 #ifdef wxHAVE_WIN32_MB2WC
2969     {
2970 #if wxUSE_FONTMAP
2971         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2972                                       : new wxMBConv_win32(m_encoding);
2973         if ( conv->IsOk() )
2974             return conv;
2975
2976         delete conv;
2977 #else
2978         return NULL;
2979 #endif
2980     }
2981 #endif // wxHAVE_WIN32_MB2WC
2982
2983 #if defined(__WXMAC__)
2984     {
2985         // leave UTF16 and UTF32 to the built-ins of wx
2986         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2987             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2988         {
2989 #if wxUSE_FONTMAP
2990             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2991                                         : new wxMBConv_mac(m_encoding);
2992 #else
2993             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2994 #endif
2995             if ( conv->IsOk() )
2996                  return conv;
2997
2998             delete conv;
2999         }
3000     }
3001 #endif
3002
3003 #ifdef __DARWIN__
3004     {
3005         // leave UTF16 and UTF32 to the built-ins of wx
3006         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3007             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3008         {
3009 #if wxUSE_FONTMAP
3010             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3011                                           : new wxMBConv_cf(m_encoding);
3012 #else
3013             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3014 #endif
3015
3016             if ( conv->IsOk() )
3017                  return conv;
3018
3019             delete conv;
3020         }
3021     }
3022 #endif // __DARWIN__
3023
3024     // step (2)
3025     wxFontEncoding enc = m_encoding;
3026 #if wxUSE_FONTMAP
3027     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3028     {
3029         // use "false" to suppress interactive dialogs -- we can be called from
3030         // anywhere and popping up a dialog from here is the last thing we want to
3031         // do
3032         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3033     }
3034 #endif // wxUSE_FONTMAP
3035
3036     switch ( enc )
3037     {
3038         case wxFONTENCODING_UTF7:
3039              return new wxMBConvUTF7;
3040
3041         case wxFONTENCODING_UTF8:
3042              return new wxMBConvUTF8;
3043
3044         case wxFONTENCODING_UTF16BE:
3045              return new wxMBConvUTF16BE;
3046
3047         case wxFONTENCODING_UTF16LE:
3048              return new wxMBConvUTF16LE;
3049
3050         case wxFONTENCODING_UTF32BE:
3051              return new wxMBConvUTF32BE;
3052
3053         case wxFONTENCODING_UTF32LE:
3054              return new wxMBConvUTF32LE;
3055
3056         default:
3057              // nothing to do but put here to suppress gcc warnings
3058              break;
3059     }
3060
3061     // step (3)
3062 #if wxUSE_FONTMAP
3063     {
3064         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3065                                       : new wxMBConv_wxwin(m_encoding);
3066         if ( conv->IsOk() )
3067             return conv;
3068
3069         delete conv;
3070     }
3071 #endif // wxUSE_FONTMAP
3072
3073     // NB: This is a hack to prevent deadlock. What could otherwise happen
3074     //     in Unicode build: wxConvLocal creation ends up being here
3075     //     because of some failure and logs the error. But wxLog will try to
3076     //     attach a timestamp, for which it will need wxConvLocal (to convert
3077     //     time to char* and then wchar_t*), but that fails, tries to log the
3078     //     error, but wxLog has an (already locked) critical section that
3079     //     guards the static buffer.
3080     static bool alreadyLoggingError = false;
3081     if (!alreadyLoggingError)
3082     {
3083         alreadyLoggingError = true;
3084         wxLogError(_("Cannot convert from the charset '%s'!"),
3085                    m_name ? m_name
3086                       :
3087 #if wxUSE_FONTMAP
3088                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3089 #else // !wxUSE_FONTMAP
3090                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3091 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3092               );
3093
3094         alreadyLoggingError = false;
3095     }
3096
3097     return NULL;
3098 }
3099
3100 void wxCSConv::CreateConvIfNeeded() const
3101 {
3102     if ( m_deferred )
3103     {
3104         wxCSConv *self = (wxCSConv *)this; // const_cast
3105
3106         // if we don't have neither the name nor the encoding, use the default
3107         // encoding for this system
3108         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3109         {
3110 #if wxUSE_INTL
3111             self->m_encoding = wxLocale::GetSystemEncoding();
3112 #else
3113             // fallback to some reasonable default:
3114             self->m_encoding = wxFONTENCODING_ISO8859_1;
3115 #endif // wxUSE_INTL
3116         }
3117
3118         self->m_convReal = DoCreate();
3119         self->m_deferred = false;
3120     }
3121 }
3122
3123 bool wxCSConv::IsOk() const
3124 {
3125     CreateConvIfNeeded();
3126
3127     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3128     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3129         return true; // always ok as we do it ourselves
3130
3131     // m_convReal->IsOk() is called at its own creation, so we know it must
3132     // be ok if m_convReal is non-NULL
3133     return m_convReal != NULL;
3134 }
3135
3136 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3137                          const char *src, size_t srcLen) const
3138 {
3139     CreateConvIfNeeded();
3140
3141     if (m_convReal)
3142         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3143
3144     // latin-1 (direct)
3145     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3146 }
3147
3148 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3149                            const wchar_t *src, size_t srcLen) const
3150 {
3151     CreateConvIfNeeded();
3152
3153     if (m_convReal)
3154         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3155
3156     // latin-1 (direct)
3157     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3158 }
3159
3160 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3161 {
3162     CreateConvIfNeeded();
3163
3164     if (m_convReal)
3165         return m_convReal->MB2WC(buf, psz, n);
3166
3167     // latin-1 (direct)
3168     size_t len = strlen(psz);
3169
3170     if (buf)
3171     {
3172         for (size_t c = 0; c <= len; c++)
3173             buf[c] = (unsigned char)(psz[c]);
3174     }
3175
3176     return len;
3177 }
3178
3179 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3180 {
3181     CreateConvIfNeeded();
3182
3183     if (m_convReal)
3184         return m_convReal->WC2MB(buf, psz, n);
3185
3186     // latin-1 (direct)
3187     const size_t len = wxWcslen(psz);
3188     if (buf)
3189     {
3190         for (size_t c = 0; c <= len; c++)
3191         {
3192             if (psz[c] > 0xFF)
3193                 return wxCONV_FAILED;
3194
3195             buf[c] = (char)psz[c];
3196         }
3197     }
3198     else
3199     {
3200         for (size_t c = 0; c <= len; c++)
3201         {
3202             if (psz[c] > 0xFF)
3203                 return wxCONV_FAILED;
3204         }
3205     }
3206
3207     return len;
3208 }
3209
3210 size_t wxCSConv::GetMBNulLen() const
3211 {
3212     CreateConvIfNeeded();
3213
3214     if ( m_convReal )
3215     {
3216         return m_convReal->GetMBNulLen();
3217     }
3218
3219     // otherwise, we are ISO-8859-1
3220     return 1;
3221 }
3222
3223 #if wxUSE_UNICODE_UTF8
3224 bool wxCSConv::IsUTF8() const
3225 {
3226     CreateConvIfNeeded();
3227
3228     if ( m_convReal )
3229     {
3230         return m_convReal->IsUTF8();
3231     }
3232
3233     // otherwise, we are ISO-8859-1
3234     return false;
3235 }
3236 #endif
3237
3238
3239 #if wxUSE_UNICODE
3240
3241 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3242 {
3243     if ( !s )
3244         return wxWCharBuffer();
3245
3246     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3247     if ( !wbuf )
3248         wbuf = wxMBConvUTF8().cMB2WX(s);
3249     if ( !wbuf )
3250         wbuf = wxConvISO8859_1.cMB2WX(s);
3251
3252     return wbuf;
3253 }
3254
3255 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3256 {
3257     if ( !ws )
3258         return wxCharBuffer();
3259
3260     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3261     if ( !buf )
3262         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3263
3264     return buf;
3265 }
3266
3267 #endif // wxUSE_UNICODE
3268
3269 // ----------------------------------------------------------------------------
3270 // globals
3271 // ----------------------------------------------------------------------------
3272
3273 // NB: The reason why we create converter objects in this convoluted way,
3274 //     using a factory function instead of a global variable, is that they
3275 //     may be used at static initialization time (some of them are used by
3276 //     wxString ctors and there may be a global wxString object). In other
3277 //     words, possibly _before_ the converter global object would be
3278 //     initialized.
3279
3280 #undef wxConvLibc
3281 #undef wxConvUTF8
3282 #undef wxConvUTF7
3283 #undef wxConvLocal
3284 #undef wxConvISO8859_1
3285
// Define everything needed for the global converter "name":
//  - the exported pointer name##Ptr, initially NULL
//  - the exported function wxGet_##name##Ptr() returning the address of its
//    function-local static object of class impl_klass constructed with
//    ctor_args (which must include the parentheses if they are not empty)
//  - a static pointer initialized by calling this function, see the comment
//    inside the macro
//
// klass is the type under which the object is made available while impl_klass
// is the class of the object really created.
3286 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3287     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3288     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3289     {                                                                   \
3290         static impl_klass name##Obj ctor_args;                          \
3291         return &name##Obj;                                              \
3292     }                                                                   \
3293     /* this ensures that all global converter objects are created */    \
3294     /* by the time static initialization is done, i.e. before any */    \
3295     /* thread is launched: */                                           \
3296     static klass* gs_##name##instance = wxGet_##name##Ptr()
3297
// Simplified version of the macro above for the common case when the object
// is made available under the type of the class really implementing it.
3298 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3299     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3300
// wxConvLibc is made available as a plain wxMBConv but the class really
// implementing it depends on the platform: wxMBConv_win32 under Windows,
// wxMBConv_mac for __WXMAC__ without __MACH__ and wxMBConvLibc elsewhere.
3301 #ifdef __WINDOWS__
3302     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3303 #elif defined(__WXMAC__) && !defined(__MACH__)
3304     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_mac, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3305 #else
3306     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3307 #endif
3308
// the UTF-8 and UTF-7 converters
3309 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3310 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3311
// wxCSConv objects for the system encoding and for ISO-8859-1
3312 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3313 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3314
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3315 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3316 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3317
// wxConvFileName initially points to: the static wxMBConv_macUTF8D object
// defined below for __WXOSX__ Carbon builds, the wxConvUTF8 object for the
// other __WXOSX__ builds and the wxConvLibc object on all the other platforms
3318 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3319 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3320 #endif
3321 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3322 #ifdef __WXOSX__
3323 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3324                                     &wxConvMacUTF8DObj;
3325 #else
3326                                     wxGet_wxConvUTF8Ptr();
3327 #endif
3328 #else // !__WXOSX__
3329                                     wxGet_wxConvLibcPtr();
3330 #endif // __WXOSX__/!__WXOSX__
3331
3332 #else // !wxUSE_WCHAR_T
3333
3334 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3335 // stand-ins in absence of wchar_t: when wxUSE_WCHAR_T is off the global
// converters are just defined as objects of the wxMBConv class itself
3336 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3337                                 wxConvISO8859_1,
3338                                 wxConvLocal,
3339                                 wxConvUTF8;
3340
3341 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T