src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  82 // be 4 bytes
  83 #if SIZEOF_WCHAR_T == 2
  84     #define WC_UTF16
  85 #endif
  86
  87 // ============================================================================
  88 // implementation
  89 // ============================================================================
  90
  91 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  92 static bool NotAllNULs(const char *p, size_t n)
  93 {
  94     while ( n && *p++ == '\0' )
  95         n--;
  96
  97     return n != 0;
  98 }
  99
 100 // ----------------------------------------------------------------------------
 101 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
 102 // ----------------------------------------------------------------------------
 103
 104 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 105 {
 106     if (input<=0xffff)
 107     {
 108         if (output)
 109             *output = (wxUint16) input;
 110         return 1;
 111     }
 112     else if (input>=0x110000)
 113     {
 114         return wxCONV_FAILED;
 115     }
 116     else
 117     {
 118         if (output)
 119         {
 120             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 121             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 122         }
 123         return 2;
 124     }
 125 }
 126
 127 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 128 {
 129     if ((*input<0xd800) || (*input>0xdfff))
 130     {
 131         output = *input;
 132         return 1;
 133     }
 134     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 135     {
 136         output = *input;
 137         return wxCONV_FAILED;
 138     }
 139     else
 140     {
 141         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 142         return 2;
 143     }
 144 }
 145
 146 #ifdef WC_UTF16
 147     typedef wchar_t wxDecodeSurrogate_t;
 148 #else // !WC_UTF16
 149     typedef wxUint16 wxDecodeSurrogate_t;
 150 #endif // WC_UTF16/!WC_UTF16
 151
 152 // returns the next UTF-32 character from the wchar_t buffer and advances the
 153 // pointer to the character after this one
 154 //
 155 // if an invalid character is found, *pSrc is set to NULL, the caller must
 156 // check for this
 157 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 158 {
 159     wxUint32 out;
 160     const size_t n = decode_utf16(*pSrc, out);
 161     if ( n == wxCONV_FAILED )
 162         *pSrc = NULL;
 163     else
 164         *pSrc += n;
 165
 166     return out;
 167 }
 168
 169 // ----------------------------------------------------------------------------
 170 // wxMBConv
 171 // ----------------------------------------------------------------------------
 172
 173 size_t
 174 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 175                   const char *src, size_t srcLen) const
 176 {
 177     // although new conversion classes are supposed to implement this function
 178     // directly, the existins ones only implement the old MB2WC() and so, to
 179     // avoid to have to rewrite all conversion classes at once, we provide a
 180     // default (but not efficient) implementation of this one in terms of the
 181     // old function by copying the input to ensure that it's NUL-terminated and
 182     // then using MB2WC() to convert it
 183
 184     // the number of chars [which would be] written to dst [if it were not NULL]
 185     size_t dstWritten = 0;
 186
 187     // the number of NULs terminating this string
 188     size_t nulLen wxDUMMY_INITIALIZE(0);
 189
 190     // if we were not given the input size we just have to assume that the
 191     // string is properly terminated as we have no way of knowing how long it
 192     // is anyhow, but if we do have the size check whether there are enough
 193     // NULs at the end
 194     wxCharBuffer bufTmp;
 195     const char *srcEnd;
 196     if ( srcLen != wxNO_LEN )
 197     {
 198         // we need to know how to find the end of this string
 199         nulLen = GetMBNulLen();
 200         if ( nulLen == wxCONV_FAILED )
 201             return wxCONV_FAILED;
 202
 203         // if there are enough NULs we can avoid the copy
 204         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 205         {
 206             // make a copy in order to properly NUL-terminate the string
 207             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 208             char * const p = bufTmp.data();
 209             memcpy(p, src, srcLen);
 210             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 211                 *s = '\0';
 212
 213             src = bufTmp;
 214         }
 215
 216         srcEnd = src + srcLen;
 217     }
 218     else // quit after the first loop iteration
 219     {
 220         srcEnd = NULL;
 221     }
 222
 223     for ( ;; )
 224     {
 225         // try to convert the current chunk
 226         size_t lenChunk = MB2WC(NULL, src, 0);
 227         if ( lenChunk == wxCONV_FAILED )
 228             return wxCONV_FAILED;
 229
 230         lenChunk++; // for the L'\0' at the end of this chunk
 231
 232         dstWritten += lenChunk;
 233
 234         if ( lenChunk == 1 )
 235         {
 236             // nothing left in the input string, conversion succeeded
 237             break;
 238         }
 239
 240         if ( dst )
 241         {
 242             if ( dstWritten > dstLen )
 243                 return wxCONV_FAILED;
 244
 245             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 246                 return wxCONV_FAILED;
 247
 248             dst += lenChunk;
 249         }
 250
 251         if ( !srcEnd )
 252         {
 253             // we convert just one chunk in this case as this is the entire
 254             // string anyhow
 255             break;
 256         }
 257
 258         // advance the input pointer past the end of this chunk
 259         while ( NotAllNULs(src, nulLen) )
 260         {
 261             // notice that we must skip over multiple bytes here as we suppose
 262             // that if NUL takes 2 or 4 bytes, then all the other characters do
 263             // too and so if advanced by a single byte we might erroneously
 264             // detect sequences of NUL bytes in the middle of the input
 265             src += nulLen;
 266         }
 267
 268         src += nulLen; // skipping over its terminator as well
 269
 270         // note that ">=" (and not just "==") is needed here as the terminator
 271         // we skipped just above could be inside or just after the buffer
 272         // delimited by inEnd
 273         if ( src >= srcEnd )
 274             break;
 275     }
 276
 277     return dstWritten;
 278 }
 279
 280 size_t
 281 wxMBConv::FromWChar(char *dst, size_t dstLen,
 282                     const wchar_t *src, size_t srcLen) const
 283 {
 284     // the number of chars [which would be] written to dst [if it were not NULL]
 285     size_t dstWritten = 0;
 286
 287     // make a copy of the input string unless it is already properly
 288     // NUL-terminated
 289     //
 290     // if we don't know its length we have no choice but to assume that it is,
 291     // indeed, properly terminated
 292     wxWCharBuffer bufTmp;
 293     if ( srcLen == wxNO_LEN )
 294     {
 295         srcLen = wxWcslen(src) + 1;
 296     }
 297     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 298     {
 299         // make a copy in order to properly NUL-terminate the string
 300         bufTmp = wxWCharBuffer(srcLen);
 301         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 302         src = bufTmp;
 303     }
 304
 305     const size_t lenNul = GetMBNulLen();
 306     for ( const wchar_t * const srcEnd = src + srcLen;
 307           src < srcEnd;
 308           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 309     {
 310         // try to convert the current chunk
 311         size_t lenChunk = WC2MB(NULL, src, 0);
 312
 313         if ( lenChunk == wxCONV_FAILED )
 314             return wxCONV_FAILED;
 315
 316         lenChunk += lenNul;
 317         dstWritten += lenChunk;
 318
 319         if ( dst )
 320         {
 321             if ( dstWritten > dstLen )
 322                 return wxCONV_FAILED;
 323
 324             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 325                 return wxCONV_FAILED;
 326
 327             dst += lenChunk;
 328         }
 329     }
 330
 331     return dstWritten;
 332 }
 333
 334 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 335 {
 336     size_t rc = ToWChar(out, outLen, in);
 337     if ( rc != wxCONV_FAILED )
 338     {
 339         // ToWChar() returns the buffer length, i.e. including the trailing
 340         // NUL, while this method doesn't take it into account
 341         rc--;
 342     }
 343
 344     return rc;
 345 }
 346
 347 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 348 {
 349     size_t rc = FromWChar(out, outLen, in);
 350     if ( rc != wxCONV_FAILED )
 351     {
 352         rc -= GetMBNulLen();
 353     }
 354
 355     return rc;
 356 }
 357
 358 wxMBConv::~wxMBConv()
 359 {
 360     // nothing to do here (necessary for Darwin linking probably)
 361 }
 362
 363 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 364 {
 365     if ( psz )
 366     {
 367         // calculate the length of the buffer needed first
 368         const size_t nLen = MB2WC(NULL, psz, 0);
 369         if ( nLen != wxCONV_FAILED )
 370         {
 371             // now do the actual conversion
 372             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 373
 374             // +1 for the trailing NULL
 375             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 376                 return buf;
 377         }
 378     }
 379
 380     return wxWCharBuffer();
 381 }
 382
 383 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 384 {
 385     if ( pwz )
 386     {
 387         const size_t nLen = WC2MB(NULL, pwz, 0);
 388         if ( nLen != wxCONV_FAILED )
 389         {
 390             // extra space for trailing NUL(s)
 391             static const size_t extraLen = GetMaxMBNulLen();
 392
 393             wxCharBuffer buf(nLen + extraLen - 1);
 394             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 395                 return buf;
 396         }
 397     }
 398
 399     return wxCharBuffer();
 400 }
 401
 402 const wxWCharBuffer
 403 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxWCharBuffer wbuf(dstLen - 1);
 409         if ( ToWChar(wbuf.data(), dstLen, in, inLen) != wxCONV_FAILED )
 410         {
 411             if ( outLen )
 412             {
 413                 *outLen = dstLen;
 414                 if ( wbuf[dstLen - 1] == L'\0' )
 415                     (*outLen)--;
 416             }
 417
 418             return wbuf;
 419         }
 420     }
 421
 422     if ( outLen )
 423         *outLen = 0;
 424
 425     return wxWCharBuffer();
 426 }
 427
 428 const wxCharBuffer
 429 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 430 {
 431     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 432     if ( dstLen != wxCONV_FAILED )
 433     {
 434         wxCharBuffer buf(dstLen - 1);
 435         if ( FromWChar(buf.data(), dstLen, in, inLen) != wxCONV_FAILED )
 436         {
 437             if ( outLen )
 438             {
 439                 *outLen = dstLen;
 440
 441                 const size_t nulLen = GetMBNulLen();
 442                 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 443                 {
 444                     // in this case the output is NUL-terminated and we're not
 445                     // supposed to count NUL
 446                     (*outLen) -= nulLen;
 447                 }
 448             }
 449
 450             return buf;
 451         }
 452     }
 453
 454     if ( outLen )
 455         *outLen = 0;
 456
 457     return wxCharBuffer();
 458 }
 459
 460 // ----------------------------------------------------------------------------
 461 // wxMBConvLibc
 462 // ----------------------------------------------------------------------------
 463
 464 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 465 {
 466     return wxMB2WC(buf, psz, n);
 467 }
 468
 469 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 470 {
 471     return wxWC2MB(buf, psz, n);
 472 }
 473
 474 // ----------------------------------------------------------------------------
 475 // wxConvBrokenFileNames
 476 // ----------------------------------------------------------------------------
 477
 478 #ifdef __UNIX__
 479
 480 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 481 {
 482     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 483                   || wxStricmp(charset, _T("UTF8")) == 0  )
 484         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 485     else
 486         m_conv = new wxCSConv(charset);
 487 }
 488
 489 #endif // __UNIX__
 490
 491 // ----------------------------------------------------------------------------
 492 // UTF-7
 493 // ----------------------------------------------------------------------------
 494
 495 // Implementation (C) 2004 Fredrik Roubert
 496
 497 //
 498 // BASE64 decoding table
 499 //
 500 static const unsigned char utf7unb64[] =
 501 {
 502     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 503     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 504     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 505     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 506     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 507     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 508     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 509     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 510     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 511     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 512     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 513     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 515     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 516     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 517     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 528     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 529     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 530     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 531     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 532     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 533     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 534 };
 535
 536 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 537 {
 538     size_t len = 0;
 539
 540     while ( *psz && (!buf || (len < n)) )
 541     {
 542         unsigned char cc = *psz++;
 543         if (cc != '+')
 544         {
 545             // plain ASCII char
 546             if (buf)
 547                 *buf++ = cc;
 548             len++;
 549         }
 550         else if (*psz == '-')
 551         {
 552             // encoded plus sign
 553             if (buf)
 554                 *buf++ = cc;
 555             len++;
 556             psz++;
 557         }
 558         else // start of BASE64 encoded string
 559         {
 560             bool lsb, ok;
 561             unsigned int d, l;
 562             for ( ok = lsb = false, d = 0, l = 0;
 563                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 564                   psz++ )
 565             {
 566                 d <<= 6;
 567                 d += cc;
 568                 for (l += 6; l >= 8; lsb = !lsb)
 569                 {
 570                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 571                     if (lsb)
 572                     {
 573                         if (buf)
 574                             *buf++ |= c;
 575                         len ++;
 576                     }
 577                     else
 578                     {
 579                         if (buf)
 580                             *buf = (wchar_t)(c << 8);
 581                     }
 582
 583                     ok = true;
 584                 }
 585             }
 586
 587             if ( !ok )
 588             {
 589                 // in valid UTF7 we should have valid characters after '+'
 590                 return wxCONV_FAILED;
 591             }
 592
 593             if (*psz == '-')
 594                 psz++;
 595         }
 596     }
 597
 598     if ( buf && (len < n) )
 599         *buf = '\0';
 600
 601     return len;
 602 }
 603
 604 //
 605 // BASE64 encoding table
 606 //
 607 static const unsigned char utf7enb64[] =
 608 {
 609     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 610     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 611     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 612     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 613     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 614     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 615     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 616     '4', '5', '6', '7', '8', '9', '+', '/'
 617 };
 618
 619 //
 620 // UTF-7 encoding table
 621 //
 622 // 0 - Set D (directly encoded characters)
 623 // 1 - Set O (optional direct characters)
 624 // 2 - whitespace characters (optional)
 625 // 3 - special characters
 626 //
 627 static const unsigned char utf7encode[128] =
 628 {
 629     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 630     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 631     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 632     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 633     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 634     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 635     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 636     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 637 };
 638
 639 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 640 {
 641     size_t len = 0;
 642
 643     while (*psz && ((!buf) || (len < n)))
 644     {
 645         wchar_t cc = *psz++;
 646         if (cc < 0x80 && utf7encode[cc] < 1)
 647         {
 648             // plain ASCII char
 649             if (buf)
 650                 *buf++ = (char)cc;
 651             len++;
 652         }
 653 #ifndef WC_UTF16
 654         else if (((wxUint32)cc) > 0xffff)
 655         {
 656             // no surrogate pair generation (yet?)
 657             return wxCONV_FAILED;
 658         }
 659 #endif
 660         else
 661         {
 662             if (buf)
 663                 *buf++ = '+';
 664             len++;
 665             if (cc != '+')
 666             {
 667                 // BASE64 encode string
 668                 unsigned int lsb, d, l;
 669                 for (d = 0, l = 0; /*nothing*/; psz++)
 670                 {
 671                     for (lsb = 0; lsb < 2; lsb ++)
 672                     {
 673                         d <<= 8;
 674                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 675
 676                         for (l += 8; l >= 6; )
 677                         {
 678                             l -= 6;
 679                             if (buf)
 680                                 *buf++ = utf7enb64[(d >> l) % 64];
 681                             len++;
 682                         }
 683                     }
 684                     cc = *psz;
 685                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 686                         break;
 687                 }
 688                 if (l != 0)
 689                 {
 690                     if (buf)
 691                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 692                     len++;
 693                 }
 694             }
 695             if (buf)
 696                 *buf++ = '-';
 697             len++;
 698         }
 699     }
 700     if (buf && (len < n))
 701         *buf = 0;
 702     return len;
 703 }
 704
 705 // ----------------------------------------------------------------------------
 706 // UTF-8
 707 // ----------------------------------------------------------------------------
 708
 709 static wxUint32 utf8_max[]=
 710     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 711
 712 // boundaries of the private use area we use to (temporarily) remap invalid
 713 // characters invalid in a UTF-8 encoded string
 714 const wxUint32 wxUnicodePUA = 0x100000;
 715 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 716
 717 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 718 {
 719     size_t len = 0;
 720
 721     while (*psz && ((!buf) || (len < n)))
 722     {
 723         const char *opsz = psz;
 724         bool invalid = false;
 725         unsigned char cc = *psz++, fc = cc;
 726         unsigned cnt;
 727         for (cnt = 0; fc & 0x80; cnt++)
 728             fc <<= 1;
 729         if (!cnt)
 730         {
 731             // plain ASCII char
 732             if (buf)
 733                 *buf++ = cc;
 734             len++;
 735
 736             // escape the escape character for octal escapes
 737             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 738                     && cc == '\\' && (!buf || len < n))
 739             {
 740                 if (buf)
 741                     *buf++ = cc;
 742                 len++;
 743             }
 744         }
 745         else
 746         {
 747             cnt--;
 748             if (!cnt)
 749             {
 750                 // invalid UTF-8 sequence
 751                 invalid = true;
 752             }
 753             else
 754             {
 755                 unsigned ocnt = cnt - 1;
 756                 wxUint32 res = cc & (0x3f >> cnt);
 757                 while (cnt--)
 758                 {
 759                     cc = *psz;
 760                     if ((cc & 0xC0) != 0x80)
 761                     {
 762                         // invalid UTF-8 sequence
 763                         invalid = true;
 764                         break;
 765                     }
 766                     psz++;
 767                     res = (res << 6) | (cc & 0x3f);
 768                 }
 769                 if (invalid || res <= utf8_max[ocnt])
 770                 {
 771                     // illegal UTF-8 encoding
 772                     invalid = true;
 773                 }
 774                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 775                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 776                 {
 777                     // if one of our PUA characters turns up externally
 778                     // it must also be treated as an illegal sequence
 779                     // (a bit like you have to escape an escape character)
 780                     invalid = true;
 781                 }
 782                 else
 783                 {
 784 #ifdef WC_UTF16
 785                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 786                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 787                     if (pa == wxCONV_FAILED)
 788                     {
 789                         invalid = true;
 790                     }
 791                     else
 792                     {
 793                         if (buf)
 794                             buf += pa;
 795                         len += pa;
 796                     }
 797 #else // !WC_UTF16
 798                     if (buf)
 799                         *buf++ = (wchar_t)res;
 800                     len++;
 801 #endif // WC_UTF16/!WC_UTF16
 802                 }
 803             }
 804             if (invalid)
 805             {
 806                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 807                 {
 808                     while (opsz < psz && (!buf || len < n))
 809                     {
 810 #ifdef WC_UTF16
 811                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 812                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 813                         wxASSERT(pa != wxCONV_FAILED);
 814                         if (buf)
 815                             buf += pa;
 816                         opsz++;
 817                         len += pa;
 818 #else
 819                         if (buf)
 820                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 821                         opsz++;
 822                         len++;
 823 #endif
 824                     }
 825                 }
 826                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 827                 {
 828                     while (opsz < psz && (!buf || len < n))
 829                     {
 830                         if ( buf && len + 3 < n )
 831                         {
 832                             unsigned char on = *opsz;
 833                             *buf++ = L'\\';
 834                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 835                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 836                             *buf++ = (wchar_t)( L'0' + on % 010 );
 837                         }
 838                         opsz++;
 839                         len += 4;
 840                     }
 841                 }
 842                 else // MAP_INVALID_UTF8_NOT
 843                 {
 844                     return wxCONV_FAILED;
 845                 }
 846             }
 847         }
 848     }
 849     if (buf && (len < n))
 850         *buf = 0;
 851     return len;
 852 }
 853
 854 static inline bool isoctal(wchar_t wch)
 855 {
 856     return L'0' <= wch && wch <= L'7';
 857 }
 858
 859 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 860 {
 861     size_t len = 0;
 862
 863     while (*psz && ((!buf) || (len < n)))
 864     {
 865         wxUint32 cc;
 866 #ifdef WC_UTF16
 867         // cast is ok for WC_UTF16
 868         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 869         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 870 #else
 871         cc=(*psz++) & 0x7fffffff;
 872 #endif
 873
 874         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 875                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 876         {
 877             if (buf)
 878                 *buf++ = (char)(cc - wxUnicodePUA);
 879             len++;
 880         }
 881         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 882                     && cc == L'\\' && psz[0] == L'\\' )
 883         {
 884             if (buf)
 885                 *buf++ = (char)cc;
 886             psz++;
 887             len++;
 888         }
 889         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 890                     cc == L'\\' &&
 891                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 892         {
 893             if (buf)
 894             {
 895                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 896                                  (psz[1] - L'0')*010 +
 897                                  (psz[2] - L'0'));
 898             }
 899
 900             psz += 3;
 901             len++;
 902         }
 903         else
 904         {
 905             unsigned cnt;
 906             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 907             if (!cnt)
 908             {
 909                 // plain ASCII char
 910                 if (buf)
 911                     *buf++ = (char) cc;
 912                 len++;
 913             }
 914
 915             else
 916             {
 917                 len += cnt + 1;
 918                 if (buf)
 919                 {
 920                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 921                     while (cnt--)
 922                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 923                 }
 924             }
 925         }
 926     }
 927
 928     if (buf && (len<n))
 929         *buf = 0;
 930
 931     return len;
 932 }
 933
 934 // ============================================================================
 935 // UTF-16
 936 // ============================================================================
 937
 938 #ifdef WORDS_BIGENDIAN
 939     #define wxMBConvUTF16straight wxMBConvUTF16BE
 940     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 941 #else
 942     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 943     #define wxMBConvUTF16straight wxMBConvUTF16LE
 944 #endif
 945
 946 /* static */
 947 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 948 {
 949     if ( srcLen == wxNO_LEN )
 950     {
 951         // count the number of bytes in input, including the trailing NULs
 952         const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
 953         for ( srcLen = 1; *in++; srcLen++ )
 954             ;
 955
 956         srcLen *= BYTES_PER_CHAR;
 957     }
 958     else // we already have the length
 959     {
 960         // we can only convert an entire number of UTF-16 characters
 961         if ( srcLen % BYTES_PER_CHAR )
 962             return wxCONV_FAILED;
 963     }
 964
 965     return srcLen;
 966 }
 967
 968 // case when in-memory representation is UTF-16 too
 969 #ifdef WC_UTF16
 970
 971 // ----------------------------------------------------------------------------
 972 // conversions without endianness change
 973 // ----------------------------------------------------------------------------
 974
 975 size_t
 976 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 977                                const char *src, size_t srcLen) const
 978 {
 979     // set up the scene for using memcpy() (which is presumably more efficient
 980     // than copying the bytes one by one)
 981     srcLen = GetLength(src, srcLen);
 982     if ( srcLen == wxNO_LEN )
 983         return wxCONV_FAILED;
 984
 985     const size_t inLen = srcLen/BYTES_PER_CHAR;
 986     if ( dst )
 987     {
 988         if ( dstLen < inLen )
 989             return wxCONV_FAILED;
 990
 991         memcpy(dst, src, srcLen);
 992     }
 993
 994     return inLen;
 995 }
 996
 997 size_t
 998 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
 999                                  const wchar_t *src, size_t srcLen) const
1000 {
1001     if ( srcLen == wxNO_LEN )
1002         srcLen = wxWcslen(src) + 1;
1003
1004     srcLen *= BYTES_PER_CHAR;
1005
1006     if ( dst )
1007     {
1008         if ( dstLen < srcLen )
1009             return wxCONV_FAILED;
1010
1011         memcpy(dst, src, srcLen);
1012     }
1013
1014     return srcLen;
1015 }
1016
1017 // ----------------------------------------------------------------------------
1018 // endian-reversing conversions
1019 // ----------------------------------------------------------------------------
1020
1021 size_t
1022 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1023                            const char *src, size_t srcLen) const
1024 {
1025     srcLen = GetLength(src, srcLen);
1026     if ( srcLen == wxNO_LEN )
1027         return wxCONV_FAILED;
1028
1029     srcLen /= BYTES_PER_CHAR;
1030
1031     if ( dst )
1032     {
1033         if ( dstLen < srcLen )
1034             return wxCONV_FAILED;
1035
1036         const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1037         for ( size_t n = 0; n < srcLen; n++, in++ )
1038         {
1039             *dst++ = wxUINT16_SWAP_ALWAYS(*in);
1040         }
1041     }
1042
1043     return srcLen;
1044 }
1045
1046 size_t
1047 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1048                              const wchar_t *src, size_t srcLen) const
1049 {
1050     if ( srcLen == wxNO_LEN )
1051         srcLen = wxWcslen(src) + 1;
1052
1053     srcLen *= BYTES_PER_CHAR;
1054
1055     if ( dst )
1056     {
1057         if ( dstLen < srcLen )
1058             return wxCONV_FAILED;
1059
1060         wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1061         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1062         {
1063             *out++ = wxUINT16_SWAP_ALWAYS(*src);
1064         }
1065     }
1066
1067     return srcLen;
1068 }
1069
1070 #else // !WC_UTF16: wchar_t is UTF-32
1071
1072 // ----------------------------------------------------------------------------
1073 // conversions without endianness change
1074 // ----------------------------------------------------------------------------
1075
1076 size_t
1077 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1078                                const char *src, size_t srcLen) const
1079 {
1080     srcLen = GetLength(src, srcLen);
1081     if ( srcLen == wxNO_LEN )
1082         return wxCONV_FAILED;
1083
1084     const size_t inLen = srcLen/BYTES_PER_CHAR;
1085     if ( !dst )
1086     {
1087         // optimization: return maximal space which could be needed for this
1088         // string even if the real size could be smaller if the buffer contains
1089         // any surrogates
1090         return inLen;
1091     }
1092
1093     size_t outLen = 0;
1094     const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1095     for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; )
1096     {
1097         const wxUint32 ch = wxDecodeSurrogate(&in);
1098         if ( !in )
1099             return wxCONV_FAILED;
1100
1101         if ( ++outLen > dstLen )
1102             return wxCONV_FAILED;
1103
1104         *dst++ = ch;
1105     }
1106
1107
1108     return outLen;
1109 }
1110
1111 size_t
1112 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1113                                  const wchar_t *src, size_t srcLen) const
1114 {
1115     if ( srcLen == wxNO_LEN )
1116         srcLen = wxWcslen(src) + 1;
1117
1118     size_t outLen = 0;
1119     wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1120     for ( size_t n = 0; n < srcLen; n++ )
1121     {
1122         wxUint16 cc[2];
1123         const size_t numChars = encode_utf16(*src++, cc);
1124         if ( numChars == wxCONV_FAILED )
1125             return wxCONV_FAILED;
1126
1127         outLen += numChars*BYTES_PER_CHAR;
1128         if ( out )
1129         {
1130             if ( outLen > dstLen )
1131                 return wxCONV_FAILED;
1132
1133             *out++ = cc[0];
1134             if ( numChars == 2 )
1135             {
1136                 // second character of a surrogate
1137                 *out++ = cc[1];
1138             }
1139         }
1140     }
1141
1142     return outLen;
1143 }
1144
1145 // ----------------------------------------------------------------------------
1146 // endian-reversing conversions
1147 // ----------------------------------------------------------------------------
1148
1149 size_t
1150 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1151                            const char *src, size_t srcLen) const
1152 {
1153     srcLen = GetLength(src, srcLen);
1154     if ( srcLen == wxNO_LEN )
1155         return wxCONV_FAILED;
1156
1157     const size_t inLen = srcLen/BYTES_PER_CHAR;
1158     if ( !dst )
1159     {
1160         // optimization: return maximal space which could be needed for this
1161         // string even if the real size could be smaller if the buffer contains
1162         // any surrogates
1163         return inLen;
1164     }
1165
1166     size_t outLen = 0;
1167     const wxUint16 *in = wx_reinterpret_cast(const wxUint16 *, src);
1168     for ( const wxUint16 * const inEnd = in + inLen; in < inEnd; )
1169     {
1170         wxUint32 ch;
1171         wxUint16 tmp[2];
1172         tmp[0] = wxUINT16_SWAP_ALWAYS(*in);
1173         in++;
1174         tmp[1] = wxUINT16_SWAP_ALWAYS(*in);
1175
1176         const size_t numChars = decode_utf16(tmp, ch);
1177         if ( numChars == wxCONV_FAILED )
1178             return wxCONV_FAILED;
1179
1180         if ( numChars == 2 )
1181             in++;
1182
1183         if ( ++outLen > dstLen )
1184             return wxCONV_FAILED;
1185
1186         *dst++ = ch;
1187     }
1188
1189
1190     return outLen;
1191 }
1192
1193 size_t
1194 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1195                              const wchar_t *src, size_t srcLen) const
1196 {
1197     if ( srcLen == wxNO_LEN )
1198         srcLen = wxWcslen(src) + 1;
1199
1200     size_t outLen = 0;
1201     wxUint16 *out = wx_reinterpret_cast(wxUint16 *, dst);
1202     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1203     {
1204         wxUint16 cc[2];
1205         const size_t numChars = encode_utf16(*src, cc);
1206         if ( numChars == wxCONV_FAILED )
1207             return wxCONV_FAILED;
1208
1209         outLen += numChars*BYTES_PER_CHAR;
1210         if ( out )
1211         {
1212             if ( outLen > dstLen )
1213                 return wxCONV_FAILED;
1214
1215             *out++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1216             if ( numChars == 2 )
1217             {
1218                 // second character of a surrogate
1219                 *out++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1220             }
1221         }
1222     }
1223
1224     return outLen;
1225 }
1226
1227 #endif // WC_UTF16/!WC_UTF16
1228
1229
1230 // ============================================================================
1231 // UTF-32
1232 // ============================================================================
1233
1234 #ifdef WORDS_BIGENDIAN
1235     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1236     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1237 #else
1238     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1239     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1240 #endif
1241
1242
1243 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1244 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1245
1246 /* static */
1247 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1248 {
1249     if ( srcLen == wxNO_LEN )
1250     {
1251         // count the number of bytes in input, including the trailing NULs
1252         const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1253         for ( srcLen = 1; *in++; srcLen++ )
1254             ;
1255
1256         srcLen *= BYTES_PER_CHAR;
1257     }
1258     else // we already have the length
1259     {
1260         // we can only convert an entire number of UTF-32 characters
1261         if ( srcLen % BYTES_PER_CHAR )
1262             return wxCONV_FAILED;
1263     }
1264
1265     return srcLen;
1266 }
1267
1268 // case when in-memory representation is UTF-16
1269 #ifdef WC_UTF16
1270
1271 // ----------------------------------------------------------------------------
1272 // conversions without endianness change
1273 // ----------------------------------------------------------------------------
1274
1275 size_t
1276 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1277                                const char *src, size_t srcLen) const
1278 {
1279     srcLen = GetLength(src, srcLen);
1280     if ( srcLen == wxNO_LEN )
1281         return wxCONV_FAILED;
1282
1283     const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1284     const size_t inLen = srcLen/BYTES_PER_CHAR;
1285     size_t outLen = 0;
1286     for ( size_t n = 0; n < inLen; n++ )
1287     {
1288         wxUint16 cc[2];
1289         const size_t numChars = encode_utf16(*in++, cc);
1290         if ( numChars == wxCONV_FAILED )
1291             return wxCONV_FAILED;
1292
1293         outLen += numChars;
1294         if ( dst )
1295         {
1296             if ( outLen > dstLen )
1297                 return wxCONV_FAILED;
1298
1299             *dst++ = cc[0];
1300             if ( numChars == 2 )
1301             {
1302                 // second character of a surrogate
1303                 *dst++ = cc[1];
1304             }
1305         }
1306     }
1307
1308     return outLen;
1309 }
1310
1311 size_t
1312 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1313                                  const wchar_t *src, size_t srcLen) const
1314 {
1315     if ( srcLen == wxNO_LEN )
1316         srcLen = wxWcslen(src) + 1;
1317
1318     if ( !dst )
1319     {
1320         // optimization: return maximal space which could be needed for this
1321         // string instead of the exact amount which could be less if there are
1322         // any surrogates in the input
1323         //
1324         // we consider that surrogates are rare enough to make it worthwhile to
1325         // avoid running the loop below at the cost of slightly extra memory
1326         // consumption
1327         return srcLen*BYTES_PER_CHAR;
1328     }
1329
1330     wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1331     size_t outLen = 0;
1332     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1333     {
1334         const wxUint32 ch = wxDecodeSurrogate(&src);
1335         if ( !src )
1336             return wxCONV_FAILED;
1337
1338         outLen += BYTES_PER_CHAR;
1339
1340         if ( outLen > dstLen )
1341             return wxCONV_FAILED;
1342
1343         *out++ = ch;
1344     }
1345
1346     return outLen;
1347 }
1348
1349 // ----------------------------------------------------------------------------
1350 // endian-reversing conversions
1351 // ----------------------------------------------------------------------------
1352
1353 size_t
1354 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1355                            const char *src, size_t srcLen) const
1356 {
1357     srcLen = GetLength(src, srcLen);
1358     if ( srcLen == wxNO_LEN )
1359         return wxCONV_FAILED;
1360
1361     const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1362     const size_t inLen = srcLen/BYTES_PER_CHAR;
1363     size_t outLen = 0;
1364     for ( size_t n = 0; n < inLen; n++, in++ )
1365     {
1366         wxUint16 cc[2];
1367         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*in), cc);
1368         if ( numChars == wxCONV_FAILED )
1369             return wxCONV_FAILED;
1370
1371         outLen += numChars;
1372         if ( dst )
1373         {
1374             if ( outLen > dstLen )
1375                 return wxCONV_FAILED;
1376
1377             *dst++ = cc[0];
1378             if ( numChars == 2 )
1379             {
1380                 // second character of a surrogate
1381                 *dst++ = cc[1];
1382             }
1383         }
1384     }
1385
1386     return outLen;
1387 }
1388
1389 size_t
1390 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1391                              const wchar_t *src, size_t srcLen) const
1392 {
1393     if ( srcLen == wxNO_LEN )
1394         srcLen = wxWcslen(src) + 1;
1395
1396     if ( !dst )
1397     {
1398         // optimization: return maximal space which could be needed for this
1399         // string instead of the exact amount which could be less if there are
1400         // any surrogates in the input
1401         //
1402         // we consider that surrogates are rare enough to make it worthwhile to
1403         // avoid running the loop below at the cost of slightly extra memory
1404         // consumption
1405         return srcLen*BYTES_PER_CHAR;
1406     }
1407
1408     wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1409     size_t outLen = 0;
1410     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1411     {
1412         const wxUint32 ch = wxDecodeSurrogate(&src);
1413         if ( !src )
1414             return wxCONV_FAILED;
1415
1416         outLen += BYTES_PER_CHAR;
1417
1418         if ( outLen > dstLen )
1419             return wxCONV_FAILED;
1420
1421         *out++ = wxUINT32_SWAP_ALWAYS(ch);
1422     }
1423
1424     return outLen;
1425 }
1426
1427 #else // !WC_UTF16: wchar_t is UTF-32
1428
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1432
1433 size_t
1434 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1435                                const char *src, size_t srcLen) const
1436 {
1437     // use memcpy() as it should be much faster than hand-written loop
1438     srcLen = GetLength(src, srcLen);
1439     if ( srcLen == wxNO_LEN )
1440         return wxCONV_FAILED;
1441
1442     const size_t inLen = srcLen/BYTES_PER_CHAR;
1443     if ( dst )
1444     {
1445         if ( dstLen < inLen )
1446             return wxCONV_FAILED;
1447
1448         memcpy(dst, src, srcLen);
1449     }
1450
1451     return inLen;
1452 }
1453
1454 size_t
1455 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1456                                  const wchar_t *src, size_t srcLen) const
1457 {
1458     if ( srcLen == wxNO_LEN )
1459         srcLen = wxWcslen(src) + 1;
1460
1461     srcLen *= BYTES_PER_CHAR;
1462
1463     if ( dst )
1464     {
1465         if ( dstLen < srcLen )
1466             return wxCONV_FAILED;
1467
1468         memcpy(dst, src, srcLen);
1469     }
1470
1471     return srcLen;
1472 }
1473
1474 // ----------------------------------------------------------------------------
1475 // endian-reversing conversions
1476 // ----------------------------------------------------------------------------
1477
1478 size_t
1479 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1480                            const char *src, size_t srcLen) const
1481 {
1482     srcLen = GetLength(src, srcLen);
1483     if ( srcLen == wxNO_LEN )
1484         return wxCONV_FAILED;
1485
1486     srcLen /= BYTES_PER_CHAR;
1487
1488     if ( dst )
1489     {
1490         if ( dstLen < srcLen )
1491             return wxCONV_FAILED;
1492
1493         const wxUint32 *in = wx_reinterpret_cast(const wxUint32 *, src);
1494         for ( size_t n = 0; n < srcLen; n++, in++ )
1495         {
1496             *dst++ = wxUINT32_SWAP_ALWAYS(*in);
1497         }
1498     }
1499
1500     return srcLen;
1501 }
1502
1503 size_t
1504 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1505                              const wchar_t *src, size_t srcLen) const
1506 {
1507     if ( srcLen == wxNO_LEN )
1508         srcLen = wxWcslen(src) + 1;
1509
1510     srcLen *= BYTES_PER_CHAR;
1511
1512     if ( dst )
1513     {
1514         if ( dstLen < srcLen )
1515             return wxCONV_FAILED;
1516
1517         wxUint32 *out = wx_reinterpret_cast(wxUint32 *, dst);
1518         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1519         {
1520             *out++ = wxUINT32_SWAP_ALWAYS(*src);
1521         }
1522     }
1523
1524     return srcLen;
1525 }
1526
1527 #endif // WC_UTF16/!WC_UTF16
1528
1529
1530 // ============================================================================
1531 // The classes doing conversion using the iconv_xxx() functions
1532 // ============================================================================
1533
1534 #ifdef HAVE_ICONV
1535
1536 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1537 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1538 //     (unless there's yet another bug in glibc) the only case when iconv()
1539 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1540 //     left in the input buffer -- when _real_ error occurs,
1541 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1542 //     iconv() failure.
1543 //     [This bug does not appear in glibc 2.2.]
1544 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1545 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1546                                      (errno != E2BIG || bufLeft != 0))
1547 #else
1548 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1549 #endif
1550
1551 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1552
1553 #define ICONV_T_INVALID ((iconv_t)-1)
1554
1555 #if SIZEOF_WCHAR_T == 4
1556     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1557     #define WC_ENC      wxFONTENCODING_UTF32
1558 #elif SIZEOF_WCHAR_T == 2
1559     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1560     #define WC_ENC      wxFONTENCODING_UTF16
1561 #else // sizeof(wchar_t) != 2 nor 4
1562     // does this ever happen?
1563     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1564 #endif
1565
1566 // ----------------------------------------------------------------------------
1567 // wxMBConv_iconv: encapsulates an iconv character set
1568 // ----------------------------------------------------------------------------
1569
1570 class wxMBConv_iconv : public wxMBConv
1571 {
1572 public:
1573     wxMBConv_iconv(const wxChar *name);
1574     virtual ~wxMBConv_iconv();
1575
1576     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1577     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1578
1579     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1580     virtual size_t GetMBNulLen() const;
1581
1582     virtual wxMBConv *Clone() const
1583     {
1584         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1585         p->m_minMBCharWidth = m_minMBCharWidth;
1586         return p;
1587     }
1588
1589     bool IsOk() const
1590         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1591
1592 protected:
1593     // the iconv handlers used to translate from multibyte to wide char and in
1594     // the other direction
1595     iconv_t m2w,
1596             w2m;
1597 #if wxUSE_THREADS
1598     // guards access to m2w and w2m objects
1599     wxMutex m_iconvMutex;
1600 #endif
1601
1602 private:
1603     // the name (for iconv_open()) of a wide char charset -- if none is
1604     // available on this machine, it will remain NULL
1605     static wxString ms_wcCharsetName;
1606
1607     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1608     // different endian-ness than the native one
1609     static bool ms_wcNeedsSwap;
1610
1611
1612     // name of the encoding handled by this conversion
1613     wxString m_name;
1614
1615     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1616     // initially
1617     size_t m_minMBCharWidth;
1618 };
1619
1620 // make the constructor available for unit testing
1621 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1622 {
1623     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1624     if ( !result->IsOk() )
1625     {
1626         delete result;
1627         return 0;
1628     }
1629     return result;
1630 }
1631
1632 wxString wxMBConv_iconv::ms_wcCharsetName;
1633 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1634
1635 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1636               : m_name(name)
1637 {
1638     m_minMBCharWidth = 0;
1639
1640     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1641     // names for the charsets
1642     const wxCharBuffer cname(wxString(name).ToAscii());
1643
1644     // check for charset that represents wchar_t:
1645     if ( ms_wcCharsetName.empty() )
1646     {
1647         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1648
1649 #if wxUSE_FONTMAP
1650         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1651 #else // !wxUSE_FONTMAP
1652         static const wxChar *names[] =
1653         {
1654 #if SIZEOF_WCHAR_T == 4
1655             _T("UCS-4"),
1656 #elif SIZEOF_WCHAR_T = 2
1657             _T("UCS-2"),
1658 #endif
1659             NULL
1660         };
1661 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1662
1663         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1664         {
1665             const wxString nameCS(*names);
1666
1667             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1668             wxString nameXE(nameCS);
1669             #ifdef WORDS_BIGENDIAN
1670                 nameXE += _T("BE");
1671             #else // little endian
1672                 nameXE += _T("LE");
1673             #endif
1674
1675             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1676                        nameXE.c_str());
1677
1678             m2w = iconv_open(nameXE.ToAscii(), cname);
1679             if ( m2w == ICONV_T_INVALID )
1680             {
1681                 // try charset w/o bytesex info (e.g. "UCS4")
1682                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1683                            nameCS.c_str());
1684                 m2w = iconv_open(nameCS.ToAscii(), cname);
1685
1686                 // and check for bytesex ourselves:
1687                 if ( m2w != ICONV_T_INVALID )
1688                 {
1689                     char    buf[2], *bufPtr;
1690                     wchar_t wbuf[2], *wbufPtr;
1691                     size_t  insz, outsz;
1692                     size_t  res;
1693
1694                     buf[0] = 'A';
1695                     buf[1] = 0;
1696                     wbuf[0] = 0;
1697                     insz = 2;
1698                     outsz = SIZEOF_WCHAR_T * 2;
1699                     wbufPtr = wbuf;
1700                     bufPtr = buf;
1701
1702                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1703                                 (char**)&wbufPtr, &outsz);
1704
1705                     if (ICONV_FAILED(res, insz))
1706                     {
1707                         wxLogLastError(wxT("iconv"));
1708                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1709                                    nameCS.c_str());
1710                     }
1711                     else // ok, can convert to this encoding, remember it
1712                     {
1713                         ms_wcCharsetName = nameCS;
1714                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1715                     }
1716                 }
1717             }
1718             else // use charset not requiring byte swapping
1719             {
1720                 ms_wcCharsetName = nameXE;
1721             }
1722         }
1723
1724         wxLogTrace(TRACE_STRCONV,
1725                    wxT("iconv wchar_t charset is \"%s\"%s"),
1726                    ms_wcCharsetName.empty() ? _T("<none>")
1727                                             : ms_wcCharsetName.c_str(),
1728                    ms_wcNeedsSwap ? _T(" (needs swap)")
1729                                   : _T(""));
1730     }
1731     else // we already have ms_wcCharsetName
1732     {
1733         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1734     }
1735
1736     if ( ms_wcCharsetName.empty() )
1737     {
1738         w2m = ICONV_T_INVALID;
1739     }
1740     else
1741     {
1742         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1743         if ( w2m == ICONV_T_INVALID )
1744         {
1745             wxLogTrace(TRACE_STRCONV,
1746                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1747                        ms_wcCharsetName.c_str(), cname.data());
1748         }
1749     }
1750 }
1751
1752 wxMBConv_iconv::~wxMBConv_iconv()
1753 {
1754     if ( m2w != ICONV_T_INVALID )
1755         iconv_close(m2w);
1756     if ( w2m != ICONV_T_INVALID )
1757         iconv_close(w2m);
1758 }
1759
1760 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1761 {
1762     // find the string length: notice that must be done differently for
1763     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1764     size_t inbuf;
1765     const size_t nulLen = GetMBNulLen();
1766     switch ( nulLen )
1767     {
1768         default:
1769             return wxCONV_FAILED;
1770
1771         case 1:
1772             inbuf = strlen(psz); // arguably more optimized than our version
1773             break;
1774
1775         case 2:
1776         case 4:
1777             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1778             // they also have to start at character boundary and not span two
1779             // adjacent characters
1780             const char *p;
1781             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1782                 ;
1783             inbuf = p - psz;
1784             break;
1785     }
1786
1787 #if wxUSE_THREADS
1788     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1789     //     Unfortunately there is a couple of global wxCSConv objects such as
1790     //     wxConvLocal that are used all over wx code, so we have to make sure
1791     //     the handle is used by at most one thread at the time. Otherwise
1792     //     only a few wx classes would be safe to use from non-main threads
1793     //     as MB<->WC conversion would fail "randomly".
1794     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1795 #endif // wxUSE_THREADS
1796
1797
1798     size_t outbuf = n * SIZEOF_WCHAR_T;
1799     size_t res, cres;
1800     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1801     wchar_t *bufPtr = buf;
1802     const char *pszPtr = psz;
1803
1804     if (buf)
1805     {
1806         // have destination buffer, convert there
1807         cres = iconv(m2w,
1808                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1809                      (char**)&bufPtr, &outbuf);
1810         res = n - (outbuf / SIZEOF_WCHAR_T);
1811
1812         if (ms_wcNeedsSwap)
1813         {
1814             // convert to native endianness
1815             for ( unsigned i = 0; i < res; i++ )
1816                 buf[n] = WC_BSWAP(buf[i]);
1817         }
1818
1819         // NUL-terminate the string if there is any space left
1820         if (res < n)
1821             buf[res] = 0;
1822     }
1823     else
1824     {
1825         // no destination buffer... convert using temp buffer
1826         // to calculate destination buffer requirement
1827         wchar_t tbuf[8];
1828         res = 0;
1829         do {
1830             bufPtr = tbuf;
1831             outbuf = 8*SIZEOF_WCHAR_T;
1832
1833             cres = iconv(m2w,
1834                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1835                          (char**)&bufPtr, &outbuf );
1836
1837             res += 8-(outbuf/SIZEOF_WCHAR_T);
1838         } while ((cres==(size_t)-1) && (errno==E2BIG));
1839     }
1840
1841     if (ICONV_FAILED(cres, inbuf))
1842     {
1843         //VS: it is ok if iconv fails, hence trace only
1844         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1845         return wxCONV_FAILED;
1846     }
1847
1848     return res;
1849 }
1850
1851 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1852 {
1853 #if wxUSE_THREADS
1854     // NB: explained in MB2WC
1855     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1856 #endif
1857
1858     size_t inlen = wxWcslen(psz);
1859     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1860     size_t outbuf = n;
1861     size_t res, cres;
1862
1863     wchar_t *tmpbuf = 0;
1864
1865     if (ms_wcNeedsSwap)
1866     {
1867         // need to copy to temp buffer to switch endianness
1868         // (doing WC_BSWAP twice on the original buffer won't help, as it
1869         //  could be in read-only memory, or be accessed in some other thread)
1870         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1871         for ( size_t i = 0; i < inlen; i++ )
1872             tmpbuf[n] = WC_BSWAP(psz[i]);
1873         tmpbuf[inlen] = L'\0';
1874         psz = tmpbuf;
1875     }
1876
1877     if (buf)
1878     {
1879         // have destination buffer, convert there
1880         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1881
1882         res = n-outbuf;
1883
1884         // NB: iconv was given only wcslen(psz) characters on input, and so
1885         //     it couldn't convert the trailing zero. Let's do it ourselves
1886         //     if there's some room left for it in the output buffer.
1887         if (res < n)
1888             buf[0] = 0;
1889     }
1890     else
1891     {
1892         // no destination buffer... convert using temp buffer
1893         // to calculate destination buffer requirement
1894         char tbuf[16];
1895         res = 0;
1896         do {
1897             buf = tbuf; outbuf = 16;
1898
1899             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1900
1901             res += 16 - outbuf;
1902         } while ((cres==(size_t)-1) && (errno==E2BIG));
1903     }
1904
1905     if (ms_wcNeedsSwap)
1906     {
1907         free(tmpbuf);
1908     }
1909
1910     if (ICONV_FAILED(cres, inbuf))
1911     {
1912         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1913         return wxCONV_FAILED;
1914     }
1915
1916     return res;
1917 }
1918
1919 size_t wxMBConv_iconv::GetMBNulLen() const
1920 {
1921     if ( m_minMBCharWidth == 0 )
1922     {
1923         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1924
1925 #if wxUSE_THREADS
1926         // NB: explained in MB2WC
1927         wxMutexLocker lock(self->m_iconvMutex);
1928 #endif
1929
1930         wchar_t *wnul = L"";
1931         char buf[8]; // should be enough for NUL in any encoding
1932         size_t inLen = sizeof(wchar_t),
1933                outLen = WXSIZEOF(buf);
1934         char *in = (char *)wnul;
1935         char *out = buf;
1936         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1937         {
1938             self->m_minMBCharWidth = (size_t)-1;
1939         }
1940         else // ok
1941         {
1942             self->m_minMBCharWidth = out - buf;
1943         }
1944     }
1945
1946     return m_minMBCharWidth;
1947 }
1948
1949 #endif // HAVE_ICONV
1950
1951
1952 // ============================================================================
1953 // Win32 conversion classes
1954 // ============================================================================
1955
1956 #ifdef wxHAVE_WIN32_MB2WC
1957
1958 // from utils.cpp
1959 #if wxUSE_FONTMAP
1960 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1961 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1962 #endif
1963
1964 class wxMBConv_win32 : public wxMBConv
1965 {
1966 public:
1967     wxMBConv_win32()
1968     {
1969         m_CodePage = CP_ACP;
1970         m_minMBCharWidth = 0;
1971     }
1972
1973     wxMBConv_win32(const wxMBConv_win32& conv)
1974     {
1975         m_CodePage = conv.m_CodePage;
1976         m_minMBCharWidth = conv.m_minMBCharWidth;
1977     }
1978
1979 #if wxUSE_FONTMAP
1980     wxMBConv_win32(const wxChar* name)
1981     {
1982         m_CodePage = wxCharsetToCodepage(name);
1983         m_minMBCharWidth = 0;
1984     }
1985
1986     wxMBConv_win32(wxFontEncoding encoding)
1987     {
1988         m_CodePage = wxEncodingToCodepage(encoding);
1989         m_minMBCharWidth = 0;
1990     }
1991 #endif // wxUSE_FONTMAP
1992
1993     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1994     {
1995         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1996         // the behaviour is not compatible with the Unix version (using iconv)
1997         // and break the library itself, e.g. wxTextInputStream::NextChar()
1998         // wouldn't work if reading an incomplete MB char didn't result in an
1999         // error
2000         //
2001         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2002         // Win XP or newer and it is not supported for UTF-[78] so we always
2003         // use our own conversions in this case. See
2004         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2005         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2006         if ( m_CodePage == CP_UTF8 )
2007         {
2008             return wxConvUTF8.MB2WC(buf, psz, n);
2009         }
2010
2011         if ( m_CodePage == CP_UTF7 )
2012         {
2013             return wxConvUTF7.MB2WC(buf, psz, n);
2014         }
2015
2016         int flags = 0;
2017         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2018                 IsAtLeastWin2kSP4() )
2019         {
2020             flags = MB_ERR_INVALID_CHARS;
2021         }
2022
2023         const size_t len = ::MultiByteToWideChar
2024                              (
2025                                 m_CodePage,     // code page
2026                                 flags,          // flags: fall on error
2027                                 psz,            // input string
2028                                 -1,             // its length (NUL-terminated)
2029                                 buf,            // output string
2030                                 buf ? n : 0     // size of output buffer
2031                              );
2032         if ( !len )
2033         {
2034             // function totally failed
2035             return wxCONV_FAILED;
2036         }
2037
2038         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2039         // check if we succeeded, by doing a double trip:
2040         if ( !flags && buf )
2041         {
2042             const size_t mbLen = strlen(psz);
2043             wxCharBuffer mbBuf(mbLen);
2044             if ( ::WideCharToMultiByte
2045                    (
2046                       m_CodePage,
2047                       0,
2048                       buf,
2049                       -1,
2050                       mbBuf.data(),
2051                       mbLen + 1,        // size in bytes, not length
2052                       NULL,
2053                       NULL
2054                    ) == 0 ||
2055                   strcmp(mbBuf, psz) != 0 )
2056             {
2057                 // we didn't obtain the same thing we started from, hence
2058                 // the conversion was lossy and we consider that it failed
2059                 return wxCONV_FAILED;
2060             }
2061         }
2062
2063         // note that it returns count of written chars for buf != NULL and size
2064         // of the needed buffer for buf == NULL so in either case the length of
2065         // the string (which never includes the terminating NUL) is one less
2066         return len - 1;
2067     }
2068
2069     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2070     {
2071         /*
2072             we have a problem here: by default, WideCharToMultiByte() may
2073             replace characters unrepresentable in the target code page with bad
2074             quality approximations such as turning "1/2" symbol (U+00BD) into
2075             "1" for the code pages which don't have it and we, obviously, want
2076             to avoid this at any price
2077
2078             the trouble is that this function does it _silently_, i.e. it won't
2079             even tell us whether it did or not... Win98/2000 and higher provide
2080             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2081             we have to resort to a round trip, i.e. check that converting back
2082             results in the same string -- this is, of course, expensive but
2083             otherwise we simply can't be sure to not garble the data.
2084          */
2085
2086         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2087         // it doesn't work with CJK encodings (which we test for rather roughly
2088         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2089         // supporting it
2090         BOOL usedDef wxDUMMY_INITIALIZE(false);
2091         BOOL *pUsedDef;
2092         int flags;
2093         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2094         {
2095             // it's our lucky day
2096             flags = WC_NO_BEST_FIT_CHARS;
2097             pUsedDef = &usedDef;
2098         }
2099         else // old system or unsupported encoding
2100         {
2101             flags = 0;
2102             pUsedDef = NULL;
2103         }
2104
2105         const size_t len = ::WideCharToMultiByte
2106                              (
2107                                 m_CodePage,     // code page
2108                                 flags,          // either none or no best fit
2109                                 pwz,            // input string
2110                                 -1,             // it is (wide) NUL-terminated
2111                                 buf,            // output buffer
2112                                 buf ? n : 0,    // and its size
2113                                 NULL,           // default "replacement" char
2114                                 pUsedDef        // [out] was it used?
2115                              );
2116
2117         if ( !len )
2118         {
2119             // function totally failed
2120             return wxCONV_FAILED;
2121         }
2122
2123         // if we were really converting, check if we succeeded
2124         if ( buf )
2125         {
2126             if ( flags )
2127             {
2128                 // check if the conversion failed, i.e. if any replacements
2129                 // were done
2130                 if ( usedDef )
2131                     return wxCONV_FAILED;
2132             }
2133             else // we must resort to double tripping...
2134             {
2135                 wxWCharBuffer wcBuf(n);
2136                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2137                         wcscmp(wcBuf, pwz) != 0 )
2138                 {
2139                     // we didn't obtain the same thing we started from, hence
2140                     // the conversion was lossy and we consider that it failed
2141                     return wxCONV_FAILED;
2142                 }
2143             }
2144         }
2145
2146         // see the comment above for the reason of "len - 1"
2147         return len - 1;
2148     }
2149
2150     virtual size_t GetMBNulLen() const
2151     {
2152         if ( m_minMBCharWidth == 0 )
2153         {
2154             int len = ::WideCharToMultiByte
2155                         (
2156                             m_CodePage,     // code page
2157                             0,              // no flags
2158                             L"",            // input string
2159                             1,              // translate just the NUL
2160                             NULL,           // output buffer
2161                             0,              // and its size
2162                             NULL,           // no replacement char
2163                             NULL            // [out] don't care if it was used
2164                         );
2165
2166             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2167             switch ( len )
2168             {
2169                 default:
2170                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2171                     // fall through
2172
2173                 case 0:
2174                     self->m_minMBCharWidth = (size_t)-1;
2175                     break;
2176
2177                 case 1:
2178                 case 2:
2179                 case 4:
2180                     self->m_minMBCharWidth = len;
2181                     break;
2182             }
2183         }
2184
2185         return m_minMBCharWidth;
2186     }
2187
2188     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2189
2190     bool IsOk() const { return m_CodePage != -1; }
2191
2192 private:
2193     static bool CanUseNoBestFit()
2194     {
2195         static int s_isWin98Or2k = -1;
2196
2197         if ( s_isWin98Or2k == -1 )
2198         {
2199             int verMaj, verMin;
2200             switch ( wxGetOsVersion(&verMaj, &verMin) )
2201             {
2202                 case wxWIN95:
2203                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2204                     break;
2205
2206                 case wxWINDOWS_NT:
2207                     s_isWin98Or2k = verMaj >= 5;
2208                     break;
2209
2210                 default:
2211                     // unknown, be conseravtive by default
2212                     s_isWin98Or2k = 0;
2213             }
2214
2215             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2216         }
2217
2218         return s_isWin98Or2k == 1;
2219     }
2220
2221     static bool IsAtLeastWin2kSP4()
2222     {
2223 #ifdef __WXWINCE__
2224         return false;
2225 #else
2226         static int s_isAtLeastWin2kSP4 = -1;
2227
2228         if ( s_isAtLeastWin2kSP4 == -1 )
2229         {
2230             OSVERSIONINFOEX ver;
2231
2232             memset(&ver, 0, sizeof(ver));
2233             ver.dwOSVersionInfoSize = sizeof(ver);
2234             GetVersionEx((OSVERSIONINFO*)&ver);
2235
2236             s_isAtLeastWin2kSP4 =
2237               ((ver.dwMajorVersion > 5) || // Vista+
2238                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2239                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2240                ver.wServicePackMajor >= 4)) // 2000 SP4+
2241               ? 1 : 0;
2242         }
2243
2244         return s_isAtLeastWin2kSP4 == 1;
2245 #endif
2246     }
2247
2248
2249     // the code page we're working with
2250     long m_CodePage;
2251
2252     // cached result of GetMBNulLen(), set to 0 initially meaning
2253     // "unknown"
2254     size_t m_minMBCharWidth;
2255 };
2256
2257 #endif // wxHAVE_WIN32_MB2WC
2258
2259 // ============================================================================
2260 // Cocoa conversion classes
2261 // ============================================================================
2262
2263 #if defined(__WXCOCOA__)
2264
2265 // RN:  There is no UTF-32 support in either Core Foundation or
2266 // Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
2267 // quite a bit - it's just not public (yet).
2268
2269 #include <CoreFoundation/CFString.h>
2270 #include <CoreFoundation/CFStringEncodingExt.h>
2271
2272 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2273 {
2274     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2275     if ( encoding == wxFONTENCODING_DEFAULT )
2276     {
2277         enc = CFStringGetSystemEncoding();
2278     }
2279     else switch( encoding)
2280     {
2281         case wxFONTENCODING_ISO8859_1 :
2282             enc = kCFStringEncodingISOLatin1 ;
2283             break ;
2284         case wxFONTENCODING_ISO8859_2 :
2285             enc = kCFStringEncodingISOLatin2;
2286             break ;
2287         case wxFONTENCODING_ISO8859_3 :
2288             enc = kCFStringEncodingISOLatin3 ;
2289             break ;
2290         case wxFONTENCODING_ISO8859_4 :
2291             enc = kCFStringEncodingISOLatin4;
2292             break ;
2293         case wxFONTENCODING_ISO8859_5 :
2294             enc = kCFStringEncodingISOLatinCyrillic;
2295             break ;
2296         case wxFONTENCODING_ISO8859_6 :
2297             enc = kCFStringEncodingISOLatinArabic;
2298             break ;
2299         case wxFONTENCODING_ISO8859_7 :
2300             enc = kCFStringEncodingISOLatinGreek;
2301             break ;
2302         case wxFONTENCODING_ISO8859_8 :
2303             enc = kCFStringEncodingISOLatinHebrew;
2304             break ;
2305         case wxFONTENCODING_ISO8859_9 :
2306             enc = kCFStringEncodingISOLatin5;
2307             break ;
2308         case wxFONTENCODING_ISO8859_10 :
2309             enc = kCFStringEncodingISOLatin6;
2310             break ;
2311         case wxFONTENCODING_ISO8859_11 :
2312             enc = kCFStringEncodingISOLatinThai;
2313             break ;
2314         case wxFONTENCODING_ISO8859_13 :
2315             enc = kCFStringEncodingISOLatin7;
2316             break ;
2317         case wxFONTENCODING_ISO8859_14 :
2318             enc = kCFStringEncodingISOLatin8;
2319             break ;
2320         case wxFONTENCODING_ISO8859_15 :
2321             enc = kCFStringEncodingISOLatin9;
2322             break ;
2323
2324         case wxFONTENCODING_KOI8 :
2325             enc = kCFStringEncodingKOI8_R;
2326             break ;
2327         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2328             enc = kCFStringEncodingDOSRussian;
2329             break ;
2330
2331 //      case wxFONTENCODING_BULGARIAN :
2332 //          enc = ;
2333 //          break ;
2334
2335         case wxFONTENCODING_CP437 :
2336             enc =kCFStringEncodingDOSLatinUS ;
2337             break ;
2338         case wxFONTENCODING_CP850 :
2339             enc = kCFStringEncodingDOSLatin1;
2340             break ;
2341         case wxFONTENCODING_CP852 :
2342             enc = kCFStringEncodingDOSLatin2;
2343             break ;
2344         case wxFONTENCODING_CP855 :
2345             enc = kCFStringEncodingDOSCyrillic;
2346             break ;
2347         case wxFONTENCODING_CP866 :
2348             enc =kCFStringEncodingDOSRussian ;
2349             break ;
2350         case wxFONTENCODING_CP874 :
2351             enc = kCFStringEncodingDOSThai;
2352             break ;
2353         case wxFONTENCODING_CP932 :
2354             enc = kCFStringEncodingDOSJapanese;
2355             break ;
2356         case wxFONTENCODING_CP936 :
2357             enc =kCFStringEncodingDOSChineseSimplif ;
2358             break ;
2359         case wxFONTENCODING_CP949 :
2360             enc = kCFStringEncodingDOSKorean;
2361             break ;
2362         case wxFONTENCODING_CP950 :
2363             enc = kCFStringEncodingDOSChineseTrad;
2364             break ;
2365         case wxFONTENCODING_CP1250 :
2366             enc = kCFStringEncodingWindowsLatin2;
2367             break ;
2368         case wxFONTENCODING_CP1251 :
2369             enc =kCFStringEncodingWindowsCyrillic ;
2370             break ;
2371         case wxFONTENCODING_CP1252 :
2372             enc =kCFStringEncodingWindowsLatin1 ;
2373             break ;
2374         case wxFONTENCODING_CP1253 :
2375             enc = kCFStringEncodingWindowsGreek;
2376             break ;
2377         case wxFONTENCODING_CP1254 :
2378             enc = kCFStringEncodingWindowsLatin5;
2379             break ;
2380         case wxFONTENCODING_CP1255 :
2381             enc =kCFStringEncodingWindowsHebrew ;
2382             break ;
2383         case wxFONTENCODING_CP1256 :
2384             enc =kCFStringEncodingWindowsArabic ;
2385             break ;
2386         case wxFONTENCODING_CP1257 :
2387             enc = kCFStringEncodingWindowsBalticRim;
2388             break ;
2389 //   This only really encodes to UTF7 (if that) evidently
2390 //        case wxFONTENCODING_UTF7 :
2391 //            enc = kCFStringEncodingNonLossyASCII ;
2392 //            break ;
2393         case wxFONTENCODING_UTF8 :
2394             enc = kCFStringEncodingUTF8 ;
2395             break ;
2396         case wxFONTENCODING_EUC_JP :
2397             enc = kCFStringEncodingEUC_JP;
2398             break ;
2399         case wxFONTENCODING_UTF16 :
2400             enc = kCFStringEncodingUnicode ;
2401             break ;
2402         case wxFONTENCODING_MACROMAN :
2403             enc = kCFStringEncodingMacRoman ;
2404             break ;
2405         case wxFONTENCODING_MACJAPANESE :
2406             enc = kCFStringEncodingMacJapanese ;
2407             break ;
2408         case wxFONTENCODING_MACCHINESETRAD :
2409             enc = kCFStringEncodingMacChineseTrad ;
2410             break ;
2411         case wxFONTENCODING_MACKOREAN :
2412             enc = kCFStringEncodingMacKorean ;
2413             break ;
2414         case wxFONTENCODING_MACARABIC :
2415             enc = kCFStringEncodingMacArabic ;
2416             break ;
2417         case wxFONTENCODING_MACHEBREW :
2418             enc = kCFStringEncodingMacHebrew ;
2419             break ;
2420         case wxFONTENCODING_MACGREEK :
2421             enc = kCFStringEncodingMacGreek ;
2422             break ;
2423         case wxFONTENCODING_MACCYRILLIC :
2424             enc = kCFStringEncodingMacCyrillic ;
2425             break ;
2426         case wxFONTENCODING_MACDEVANAGARI :
2427             enc = kCFStringEncodingMacDevanagari ;
2428             break ;
2429         case wxFONTENCODING_MACGURMUKHI :
2430             enc = kCFStringEncodingMacGurmukhi ;
2431             break ;
2432         case wxFONTENCODING_MACGUJARATI :
2433             enc = kCFStringEncodingMacGujarati ;
2434             break ;
2435         case wxFONTENCODING_MACORIYA :
2436             enc = kCFStringEncodingMacOriya ;
2437             break ;
2438         case wxFONTENCODING_MACBENGALI :
2439             enc = kCFStringEncodingMacBengali ;
2440             break ;
2441         case wxFONTENCODING_MACTAMIL :
2442             enc = kCFStringEncodingMacTamil ;
2443             break ;
2444         case wxFONTENCODING_MACTELUGU :
2445             enc = kCFStringEncodingMacTelugu ;
2446             break ;
2447         case wxFONTENCODING_MACKANNADA :
2448             enc = kCFStringEncodingMacKannada ;
2449             break ;
2450         case wxFONTENCODING_MACMALAJALAM :
2451             enc = kCFStringEncodingMacMalayalam ;
2452             break ;
2453         case wxFONTENCODING_MACSINHALESE :
2454             enc = kCFStringEncodingMacSinhalese ;
2455             break ;
2456         case wxFONTENCODING_MACBURMESE :
2457             enc = kCFStringEncodingMacBurmese ;
2458             break ;
2459         case wxFONTENCODING_MACKHMER :
2460             enc = kCFStringEncodingMacKhmer ;
2461             break ;
2462         case wxFONTENCODING_MACTHAI :
2463             enc = kCFStringEncodingMacThai ;
2464             break ;
2465         case wxFONTENCODING_MACLAOTIAN :
2466             enc = kCFStringEncodingMacLaotian ;
2467             break ;
2468         case wxFONTENCODING_MACGEORGIAN :
2469             enc = kCFStringEncodingMacGeorgian ;
2470             break ;
2471         case wxFONTENCODING_MACARMENIAN :
2472             enc = kCFStringEncodingMacArmenian ;
2473             break ;
2474         case wxFONTENCODING_MACCHINESESIMP :
2475             enc = kCFStringEncodingMacChineseSimp ;
2476             break ;
2477         case wxFONTENCODING_MACTIBETAN :
2478             enc = kCFStringEncodingMacTibetan ;
2479             break ;
2480         case wxFONTENCODING_MACMONGOLIAN :
2481             enc = kCFStringEncodingMacMongolian ;
2482             break ;
2483         case wxFONTENCODING_MACETHIOPIC :
2484             enc = kCFStringEncodingMacEthiopic ;
2485             break ;
2486         case wxFONTENCODING_MACCENTRALEUR :
2487             enc = kCFStringEncodingMacCentralEurRoman ;
2488             break ;
2489         case wxFONTENCODING_MACVIATNAMESE :
2490             enc = kCFStringEncodingMacVietnamese ;
2491             break ;
2492         case wxFONTENCODING_MACARABICEXT :
2493             enc = kCFStringEncodingMacExtArabic ;
2494             break ;
2495         case wxFONTENCODING_MACSYMBOL :
2496             enc = kCFStringEncodingMacSymbol ;
2497             break ;
2498         case wxFONTENCODING_MACDINGBATS :
2499             enc = kCFStringEncodingMacDingbats ;
2500             break ;
2501         case wxFONTENCODING_MACTURKISH :
2502             enc = kCFStringEncodingMacTurkish ;
2503             break ;
2504         case wxFONTENCODING_MACCROATIAN :
2505             enc = kCFStringEncodingMacCroatian ;
2506             break ;
2507         case wxFONTENCODING_MACICELANDIC :
2508             enc = kCFStringEncodingMacIcelandic ;
2509             break ;
2510         case wxFONTENCODING_MACROMANIAN :
2511             enc = kCFStringEncodingMacRomanian ;
2512             break ;
2513         case wxFONTENCODING_MACCELTIC :
2514             enc = kCFStringEncodingMacCeltic ;
2515             break ;
2516         case wxFONTENCODING_MACGAELIC :
2517             enc = kCFStringEncodingMacGaelic ;
2518             break ;
2519 //      case wxFONTENCODING_MACKEYBOARD :
2520 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2521 //          break ;
2522         default :
2523             // because gcc is picky
2524             break ;
2525     } ;
2526     return enc ;
2527 }
2528
2529 class wxMBConv_cocoa : public wxMBConv
2530 {
2531 public:
2532     wxMBConv_cocoa()
2533     {
2534         Init(CFStringGetSystemEncoding()) ;
2535     }
2536
2537     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2538     {
2539         m_encoding = conv.m_encoding;
2540     }
2541
2542 #if wxUSE_FONTMAP
2543     wxMBConv_cocoa(const wxChar* name)
2544     {
2545         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2546     }
2547 #endif
2548
2549     wxMBConv_cocoa(wxFontEncoding encoding)
2550     {
2551         Init( wxCFStringEncFromFontEnc(encoding) );
2552     }
2553
2554     ~wxMBConv_cocoa()
2555     {
2556     }
2557
2558     void Init( CFStringEncoding encoding)
2559     {
2560         m_encoding = encoding ;
2561     }
2562
2563     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2564     {
2565         wxASSERT(szUnConv);
2566
2567         CFStringRef theString = CFStringCreateWithBytes (
2568                                                 NULL, //the allocator
2569                                                 (const UInt8*)szUnConv,
2570                                                 strlen(szUnConv),
2571                                                 m_encoding,
2572                                                 false //no BOM/external representation
2573                                                 );
2574
2575         wxASSERT(theString);
2576
2577         size_t nOutLength = CFStringGetLength(theString);
2578
2579         if (szOut == NULL)
2580         {
2581             CFRelease(theString);
2582             return nOutLength;
2583         }
2584
2585         CFRange theRange = { 0, nOutSize };
2586
2587 #if SIZEOF_WCHAR_T == 4
2588         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2589 #endif
2590
2591         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2592
2593         CFRelease(theString);
2594
2595         szUniCharBuffer[nOutLength] = '\0' ;
2596
2597 #if SIZEOF_WCHAR_T == 4
2598         wxMBConvUTF16 converter ;
2599         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2600         delete[] szUniCharBuffer;
2601 #endif
2602
2603         return nOutLength;
2604     }
2605
2606     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2607     {
2608         wxASSERT(szUnConv);
2609
2610         size_t nRealOutSize;
2611         size_t nBufSize = wxWcslen(szUnConv);
2612         UniChar* szUniBuffer = (UniChar*) szUnConv;
2613
2614 #if SIZEOF_WCHAR_T == 4
2615         wxMBConvUTF16 converter ;
2616         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2617         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2618         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2619         nBufSize /= sizeof(UniChar);
2620 #endif
2621
2622         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2623                                 NULL, //allocator
2624                                 szUniBuffer,
2625                                 nBufSize,
2626                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2627                             );
2628
2629         wxASSERT(theString);
2630
2631         //Note that CER puts a BOM when converting to unicode
2632         //so we  check and use getchars instead in that case
2633         if (m_encoding == kCFStringEncodingUnicode)
2634         {
2635             if (szOut != NULL)
2636                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2637
2638             nRealOutSize = CFStringGetLength(theString) + 1;
2639         }
2640         else
2641         {
2642             CFStringGetBytes(
2643                 theString,
2644                 CFRangeMake(0, CFStringGetLength(theString)),
2645                 m_encoding,
2646                 0, //what to put in characters that can't be converted -
2647                     //0 tells CFString to return NULL if it meets such a character
2648                 false, //not an external representation
2649                 (UInt8*) szOut,
2650                 nOutSize,
2651                 (CFIndex*) &nRealOutSize
2652                         );
2653         }
2654
2655         CFRelease(theString);
2656
2657 #if SIZEOF_WCHAR_T == 4
2658         delete[] szUniBuffer;
2659 #endif
2660
2661         return  nRealOutSize - 1;
2662     }
2663
2664     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2665
2666     bool IsOk() const
2667     {
2668         return m_encoding != kCFStringEncodingInvalidId &&
2669               CFStringIsEncodingAvailable(m_encoding);
2670     }
2671
2672 private:
2673     CFStringEncoding m_encoding ;
2674 };
2675
2676 #endif // defined(__WXCOCOA__)
2677
2678 // ============================================================================
2679 // Mac conversion classes
2680 // ============================================================================
2681
2682 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2683
2684 class wxMBConv_mac : public wxMBConv
2685 {
2686 public:
2687     wxMBConv_mac()
2688     {
2689         Init(CFStringGetSystemEncoding()) ;
2690     }
2691
2692     wxMBConv_mac(const wxMBConv_mac& conv)
2693     {
2694         Init(conv.m_char_encoding);
2695     }
2696
2697 #if wxUSE_FONTMAP
2698     wxMBConv_mac(const wxChar* name)
2699     {
2700         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2701     }
2702 #endif
2703
2704     wxMBConv_mac(wxFontEncoding encoding)
2705     {
2706         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2707     }
2708
2709     ~wxMBConv_mac()
2710     {
2711         OSStatus status = noErr ;
2712         status = TECDisposeConverter(m_MB2WC_converter);
2713         status = TECDisposeConverter(m_WC2MB_converter);
2714     }
2715
2716
2717     void Init( TextEncodingBase encoding)
2718     {
2719         OSStatus status = noErr ;
2720         m_char_encoding = encoding ;
2721         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2722
2723         status = TECCreateConverter(&m_MB2WC_converter,
2724                                     m_char_encoding,
2725                                     m_unicode_encoding);
2726         status = TECCreateConverter(&m_WC2MB_converter,
2727                                     m_unicode_encoding,
2728                                     m_char_encoding);
2729     }
2730
2731     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2732     {
2733         OSStatus status = noErr ;
2734         ByteCount byteOutLen ;
2735         ByteCount byteInLen = strlen(psz) ;
2736         wchar_t *tbuf = NULL ;
2737         UniChar* ubuf = NULL ;
2738         size_t res = 0 ;
2739
2740         if (buf == NULL)
2741         {
2742             //apple specs say at least 32
2743             n = wxMax( 32 , byteInLen ) ;
2744             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2745         }
2746         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2747 #if SIZEOF_WCHAR_T == 4
2748         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2749 #else
2750         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2751 #endif
2752         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2753           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2754 #if SIZEOF_WCHAR_T == 4
2755         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2756         // is not properly terminated we get random characters at the end
2757         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2758         wxMBConvUTF16 converter ;
2759         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2760         free( ubuf ) ;
2761 #else
2762         res = byteOutLen / sizeof( UniChar ) ;
2763 #endif
2764         if ( buf == NULL )
2765              free(tbuf) ;
2766
2767         if ( buf  && res < n)
2768             buf[res] = 0;
2769
2770         return res ;
2771     }
2772
2773     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2774     {
2775         OSStatus status = noErr ;
2776         ByteCount byteOutLen ;
2777         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2778
2779         char *tbuf = NULL ;
2780
2781         if (buf == NULL)
2782         {
2783             //apple specs say at least 32
2784             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2785             tbuf = (char*) malloc( n ) ;
2786         }
2787
2788         ByteCount byteBufferLen = n ;
2789         UniChar* ubuf = NULL ;
2790 #if SIZEOF_WCHAR_T == 4
2791         wxMBConvUTF16 converter ;
2792         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2793         byteInLen = unicharlen ;
2794         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2795         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2796 #else
2797         ubuf = (UniChar*) psz ;
2798 #endif
2799         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2800             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2801 #if SIZEOF_WCHAR_T == 4
2802         free( ubuf ) ;
2803 #endif
2804         if ( buf == NULL )
2805             free(tbuf) ;
2806
2807         size_t res = byteOutLen ;
2808         if ( buf  && res < n)
2809         {
2810             buf[res] = 0;
2811
2812             //we need to double-trip to verify it didn't insert any ? in place
2813             //of bogus characters
2814             wxWCharBuffer wcBuf(n);
2815             size_t pszlen = wxWcslen(psz);
2816             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2817                         wxWcslen(wcBuf) != pszlen ||
2818                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2819             {
2820                 // we didn't obtain the same thing we started from, hence
2821                 // the conversion was lossy and we consider that it failed
2822                 return wxCONV_FAILED;
2823             }
2824         }
2825
2826         return res ;
2827     }
2828
2829     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2830
2831     bool IsOk() const
2832         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2833
2834 private:
2835     TECObjectRef m_MB2WC_converter ;
2836     TECObjectRef m_WC2MB_converter ;
2837
2838     TextEncodingBase m_char_encoding ;
2839     TextEncodingBase m_unicode_encoding ;
2840 };
2841
2842 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2843
2844 // ============================================================================
2845 // wxEncodingConverter based conversion classes
2846 // ============================================================================
2847
2848 #if wxUSE_FONTMAP
2849
2850 class wxMBConv_wxwin : public wxMBConv
2851 {
2852 private:
2853     void Init()
2854     {
2855         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2856                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2857     }
2858
2859 public:
2860     // temporarily just use wxEncodingConverter stuff,
2861     // so that it works while a better implementation is built
2862     wxMBConv_wxwin(const wxChar* name)
2863     {
2864         if (name)
2865             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2866         else
2867             m_enc = wxFONTENCODING_SYSTEM;
2868
2869         Init();
2870     }
2871
2872     wxMBConv_wxwin(wxFontEncoding enc)
2873     {
2874         m_enc = enc;
2875
2876         Init();
2877     }
2878
2879     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2880     {
2881         size_t inbuf = strlen(psz);
2882         if (buf)
2883         {
2884             if (!m2w.Convert(psz,buf))
2885                 return wxCONV_FAILED;
2886         }
2887         return inbuf;
2888     }
2889
2890     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2891     {
2892         const size_t inbuf = wxWcslen(psz);
2893         if (buf)
2894         {
2895             if (!w2m.Convert(psz,buf))
2896                 return wxCONV_FAILED;
2897         }
2898
2899         return inbuf;
2900     }
2901
2902     virtual size_t GetMBNulLen() const
2903     {
2904         switch ( m_enc )
2905         {
2906             case wxFONTENCODING_UTF16BE:
2907             case wxFONTENCODING_UTF16LE:
2908                 return 2;
2909
2910             case wxFONTENCODING_UTF32BE:
2911             case wxFONTENCODING_UTF32LE:
2912                 return 4;
2913
2914             default:
2915                 return 1;
2916         }
2917     }
2918
2919     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2920
2921     bool IsOk() const { return m_ok; }
2922
2923 public:
2924     wxFontEncoding m_enc;
2925     wxEncodingConverter m2w, w2m;
2926
2927 private:
2928     // were we initialized successfully?
2929     bool m_ok;
2930
2931     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2932 };
2933
2934 // make the constructors available for unit testing
2935 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2936 {
2937     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2938     if ( !result->IsOk() )
2939     {
2940         delete result;
2941         return 0;
2942     }
2943     return result;
2944 }
2945
2946 #endif // wxUSE_FONTMAP
2947
2948 // ============================================================================
2949 // wxCSConv implementation
2950 // ============================================================================
2951
2952 void wxCSConv::Init()
2953 {
2954     m_name = NULL;
2955     m_convReal =  NULL;
2956     m_deferred = true;
2957 }
2958
2959 wxCSConv::wxCSConv(const wxChar *charset)
2960 {
2961     Init();
2962
2963     if ( charset )
2964     {
2965         SetName(charset);
2966     }
2967
2968 #if wxUSE_FONTMAP
2969     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2970 #else
2971     m_encoding = wxFONTENCODING_SYSTEM;
2972 #endif
2973 }
2974
2975 wxCSConv::wxCSConv(wxFontEncoding encoding)
2976 {
2977     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2978     {
2979         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2980
2981         encoding = wxFONTENCODING_SYSTEM;
2982     }
2983
2984     Init();
2985
2986     m_encoding = encoding;
2987 }
2988
2989 wxCSConv::~wxCSConv()
2990 {
2991     Clear();
2992 }
2993
2994 wxCSConv::wxCSConv(const wxCSConv& conv)
2995         : wxMBConv()
2996 {
2997     Init();
2998
2999     SetName(conv.m_name);
3000     m_encoding = conv.m_encoding;
3001 }
3002
3003 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3004 {
3005     Clear();
3006
3007     SetName(conv.m_name);
3008     m_encoding = conv.m_encoding;
3009
3010     return *this;
3011 }
3012
3013 void wxCSConv::Clear()
3014 {
3015     free(m_name);
3016     delete m_convReal;
3017
3018     m_name = NULL;
3019     m_convReal = NULL;
3020 }
3021
3022 void wxCSConv::SetName(const wxChar *charset)
3023 {
3024     if (charset)
3025     {
3026         m_name = wxStrdup(charset);
3027         m_deferred = true;
3028     }
3029 }
3030
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a wxString with a wxFontEncoding key
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// file-local cache used by wxCSConv::DoCreate(): for an encoding it holds the
// charset name which iconv accepted, or an empty string if none of the
// candidate names worked
static wxEncodingNameCache gs_nameCache;
#endif
3039
3040 wxMBConv *wxCSConv::DoCreate() const
3041 {
3042 #if wxUSE_FONTMAP
3043     wxLogTrace(TRACE_STRCONV,
3044                wxT("creating conversion for %s"),
3045                (m_name ? m_name
3046                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3047 #endif // wxUSE_FONTMAP
3048
3049     // check for the special case of ASCII or ISO8859-1 charset: as we have
3050     // special knowledge of it anyhow, we don't need to create a special
3051     // conversion object
3052     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3053             m_encoding == wxFONTENCODING_DEFAULT )
3054     {
3055         // don't convert at all
3056         return NULL;
3057     }
3058
3059     // we trust OS to do conversion better than we can so try external
3060     // conversion methods first
3061     //
3062     // the full order is:
3063     //      1. OS conversion (iconv() under Unix or Win32 API)
3064     //      2. hard coded conversions for UTF
3065     //      3. wxEncodingConverter as fall back
3066
3067     // step (1)
3068 #ifdef HAVE_ICONV
3069 #if !wxUSE_FONTMAP
3070     if ( m_name )
3071 #endif // !wxUSE_FONTMAP
3072     {
3073         wxString name(m_name);
3074         wxFontEncoding encoding(m_encoding);
3075
3076         if ( !name.empty() )
3077         {
3078             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3079             if ( conv->IsOk() )
3080                 return conv;
3081
3082             delete conv;
3083
3084 #if wxUSE_FONTMAP
3085             encoding =
3086                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3087 #endif // wxUSE_FONTMAP
3088         }
3089 #if wxUSE_FONTMAP
3090         {
3091             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3092             if ( it != gs_nameCache.end() )
3093             {
3094                 if ( it->second.empty() )
3095                     return NULL;
3096
3097                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3098                 if ( conv->IsOk() )
3099                     return conv;
3100
3101                 delete conv;
3102             }
3103
3104             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3105
3106             for ( ; *names; ++names )
3107             {
3108                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3109                 if ( conv->IsOk() )
3110                 {
3111                     gs_nameCache[encoding] = *names;
3112                     return conv;
3113                 }
3114
3115                 delete conv;
3116             }
3117
3118             gs_nameCache[encoding] = _T(""); // cache the failure
3119         }
3120 #endif // wxUSE_FONTMAP
3121     }
3122 #endif // HAVE_ICONV
3123
3124 #ifdef wxHAVE_WIN32_MB2WC
3125     {
3126 #if wxUSE_FONTMAP
3127         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3128                                       : new wxMBConv_win32(m_encoding);
3129         if ( conv->IsOk() )
3130             return conv;
3131
3132         delete conv;
3133 #else
3134         return NULL;
3135 #endif
3136     }
3137 #endif // wxHAVE_WIN32_MB2WC
3138 #if defined(__WXMAC__)
3139     {
3140         // leave UTF16 and UTF32 to the built-ins of wx
3141         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3142             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3143         {
3144
3145 #if wxUSE_FONTMAP
3146             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3147                                         : new wxMBConv_mac(m_encoding);
3148 #else
3149             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3150 #endif
3151             if ( conv->IsOk() )
3152                  return conv;
3153
3154             delete conv;
3155         }
3156     }
3157 #endif
3158 #if defined(__WXCOCOA__)
3159     {
3160         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3161         {
3162
3163 #if wxUSE_FONTMAP
3164             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3165                                           : new wxMBConv_cocoa(m_encoding);
3166 #else
3167             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3168 #endif
3169             if ( conv->IsOk() )
3170                  return conv;
3171
3172             delete conv;
3173         }
3174     }
3175 #endif
3176     // step (2)
3177     wxFontEncoding enc = m_encoding;
3178 #if wxUSE_FONTMAP
3179     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3180     {
3181         // use "false" to suppress interactive dialogs -- we can be called from
3182         // anywhere and popping up a dialog from here is the last thing we want to
3183         // do
3184         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3185     }
3186 #endif // wxUSE_FONTMAP
3187
3188     switch ( enc )
3189     {
3190         case wxFONTENCODING_UTF7:
3191              return new wxMBConvUTF7;
3192
3193         case wxFONTENCODING_UTF8:
3194              return new wxMBConvUTF8;
3195
3196         case wxFONTENCODING_UTF16BE:
3197              return new wxMBConvUTF16BE;
3198
3199         case wxFONTENCODING_UTF16LE:
3200              return new wxMBConvUTF16LE;
3201
3202         case wxFONTENCODING_UTF32BE:
3203              return new wxMBConvUTF32BE;
3204
3205         case wxFONTENCODING_UTF32LE:
3206              return new wxMBConvUTF32LE;
3207
3208         default:
3209              // nothing to do but put here to suppress gcc warnings
3210              ;
3211     }
3212
3213     // step (3)
3214 #if wxUSE_FONTMAP
3215     {
3216         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3217                                       : new wxMBConv_wxwin(m_encoding);
3218         if ( conv->IsOk() )
3219             return conv;
3220
3221         delete conv;
3222     }
3223 #endif // wxUSE_FONTMAP
3224
3225     // NB: This is a hack to prevent deadlock. What could otherwise happen
3226     //     in Unicode build: wxConvLocal creation ends up being here
3227     //     because of some failure and logs the error. But wxLog will try to
3228     //     attach timestamp, for which it will need wxConvLocal (to convert
3229     //     time to char* and then wchar_t*), but that fails, tries to log
3230     //     error, but wxLog has a (already locked) critical section that
3231     //     guards static buffer.
3232     static bool alreadyLoggingError = false;
3233     if (!alreadyLoggingError)
3234     {
3235         alreadyLoggingError = true;
3236         wxLogError(_("Cannot convert from the charset '%s'!"),
3237                    m_name ? m_name
3238                       :
3239 #if wxUSE_FONTMAP
3240                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3241 #else // !wxUSE_FONTMAP
3242                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3243 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3244               );
3245         alreadyLoggingError = false;
3246     }
3247
3248     return NULL;
3249 }
3250
3251 void wxCSConv::CreateConvIfNeeded() const
3252 {
3253     if ( m_deferred )
3254     {
3255         wxCSConv *self = (wxCSConv *)this; // const_cast
3256
3257 #if wxUSE_INTL
3258         // if we don't have neither the name nor the encoding, use the default
3259         // encoding for this system
3260         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3261         {
3262             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3263         }
3264 #endif // wxUSE_INTL
3265
3266         self->m_convReal = DoCreate();
3267         self->m_deferred = false;
3268     }
3269 }
3270
3271 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3272 {
3273     CreateConvIfNeeded();
3274
3275     if (m_convReal)
3276         return m_convReal->MB2WC(buf, psz, n);
3277
3278     // latin-1 (direct)
3279     size_t len = strlen(psz);
3280
3281     if (buf)
3282     {
3283         for (size_t c = 0; c <= len; c++)
3284             buf[c] = (unsigned char)(psz[c]);
3285     }
3286
3287     return len;
3288 }
3289
3290 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3291 {
3292     CreateConvIfNeeded();
3293
3294     if (m_convReal)
3295         return m_convReal->WC2MB(buf, psz, n);
3296
3297     // latin-1 (direct)
3298     const size_t len = wxWcslen(psz);
3299     if (buf)
3300     {
3301         for (size_t c = 0; c <= len; c++)
3302         {
3303             if (psz[c] > 0xFF)
3304                 return wxCONV_FAILED;
3305             buf[c] = (char)psz[c];
3306         }
3307     }
3308     else
3309     {
3310         for (size_t c = 0; c <= len; c++)
3311         {
3312             if (psz[c] > 0xFF)
3313                 return wxCONV_FAILED;
3314         }
3315     }
3316
3317     return len;
3318 }
3319
3320 size_t wxCSConv::GetMBNulLen() const
3321 {
3322     CreateConvIfNeeded();
3323
3324     if ( m_convReal )
3325     {
3326         return m_convReal->GetMBNulLen();
3327     }
3328
3329     return 1;
3330 }
3331
3332 // ----------------------------------------------------------------------------
3333 // globals
3334 // ----------------------------------------------------------------------------
3335
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported names are references to the file-local objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;

// wxConvCurrent initially points to the libc converter
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;

// wxConvFileName points to the UTF-8 converter when __WXOSX__ is defined and
// to the libc converter otherwise
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3361
3362
3363 #else // !wxUSE_WCHAR_T
3364
// stand-ins in absence of wchar_t: in this configuration the predefined
// converters are plain wxMBConv objects rather than references
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3370
3371 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T