src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21 #endif
  22
  23 #include "wx/strconv.h"
  24
  25 #if wxUSE_WCHAR_T
  26
  27 #ifdef __WINDOWS__
  28     #include "wx/msw/private.h"
  29     #include "wx/msw/missing.h"
  30 #endif
  31
  32 #ifndef __WXWINCE__
  33 #include <errno.h>
  34 #endif
  35
  36 #include <ctype.h>
  37 #include <string.h>
  38 #include <stdlib.h>
  39
  40 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  41     #define wxHAVE_WIN32_MB2WC
  42 #endif
  43
  44 #ifdef __SALFORDC__
  45     #include <clib.h>
  46 #endif
  47
  48 #ifdef HAVE_ICONV
  49     #include <iconv.h>
  50     #include "wx/thread.h"
  51 #endif
  52
  53 #include "wx/encconv.h"
  54 #include "wx/fontmap.h"
  55 #include "wx/utils.h"
  56
  57 #ifdef __WXMAC__
  58 #ifndef __DARWIN__
  59 #include <ATSUnicode.h>
  60 #include <TextCommon.h>
  61 #include <TextEncodingConverter.h>
  62 #endif
  63
  64 // includes Mac headers
  65 #include "wx/mac/private.h"
  66 #endif
  67
  68
  69 #define TRACE_STRCONV _T("strconv")
  70
  71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  72 // be 4 bytes
  73 #if SIZEOF_WCHAR_T == 2
  74     #define WC_UTF16
  75 #endif
  76
  77
  78 // ============================================================================
  79 // implementation
  80 // ============================================================================
  81
  82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  83 static bool NotAllNULs(const char *p, size_t n)
  84 {
  85     while ( n && *p++ == '\0' )
  86         n--;
  87
  88     return n != 0;
  89 }
  90
  91 // ----------------------------------------------------------------------------
  92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  93 // ----------------------------------------------------------------------------
  94
  95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  96 {
  97     if (input <= 0xffff)
  98     {
  99         if (output)
 100             *output = (wxUint16) input;
 101
 102         return 1;
 103     }
 104     else if (input >= 0x110000)
 105     {
 106         return wxCONV_FAILED;
 107     }
 108     else
 109     {
 110         if (output)
 111         {
 112             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 113             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 114         }
 115
 116         return 2;
 117     }
 118 }
 119
 120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 121 {
 122     if ((*input < 0xd800) || (*input > 0xdfff))
 123     {
 124         output = *input;
 125         return 1;
 126     }
 127     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 128     {
 129         output = *input;
 130         return wxCONV_FAILED;
 131     }
 132     else
 133     {
 134         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 135         return 2;
 136     }
 137 }
 138
 139 #ifdef WC_UTF16
 140     typedef wchar_t wxDecodeSurrogate_t;
 141 #else // !WC_UTF16
 142     typedef wxUint16 wxDecodeSurrogate_t;
 143 #endif // WC_UTF16/!WC_UTF16
 144
 145 // returns the next UTF-32 character from the wchar_t buffer and advances the
 146 // pointer to the character after this one
 147 //
 148 // if an invalid character is found, *pSrc is set to NULL, the caller must
 149 // check for this
 150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 151 {
 152     wxUint32 out;
 153     const size_t
 154         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 155     if ( n == wxCONV_FAILED )
 156         *pSrc = NULL;
 157     else
 158         *pSrc += n;
 159
 160     return out;
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // wxMBConv
 165 // ----------------------------------------------------------------------------
 166
 167 size_t
 168 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 169                   const char *src, size_t srcLen) const
 170 {
 171     // although new conversion classes are supposed to implement this function
 172     // directly, the existins ones only implement the old MB2WC() and so, to
 173     // avoid to have to rewrite all conversion classes at once, we provide a
 174     // default (but not efficient) implementation of this one in terms of the
 175     // old function by copying the input to ensure that it's NUL-terminated and
 176     // then using MB2WC() to convert it
 177
 178     // the number of chars [which would be] written to dst [if it were not NULL]
 179     size_t dstWritten = 0;
 180
 181     // the number of NULs terminating this string
 182     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 183
 184     // if we were not given the input size we just have to assume that the
 185     // string is properly terminated as we have no way of knowing how long it
 186     // is anyhow, but if we do have the size check whether there are enough
 187     // NULs at the end
 188     wxCharBuffer bufTmp;
 189     const char *srcEnd;
 190     if ( srcLen != wxNO_LEN )
 191     {
 192         // we need to know how to find the end of this string
 193         nulLen = GetMBNulLen();
 194         if ( nulLen == wxCONV_FAILED )
 195             return wxCONV_FAILED;
 196
 197         // if there are enough NULs we can avoid the copy
 198         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 199         {
 200             // make a copy in order to properly NUL-terminate the string
 201             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 202             char * const p = bufTmp.data();
 203             memcpy(p, src, srcLen);
 204             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 205                 *s = '\0';
 206
 207             src = bufTmp;
 208         }
 209
 210         srcEnd = src + srcLen;
 211     }
 212     else // quit after the first loop iteration
 213     {
 214         srcEnd = NULL;
 215     }
 216
 217     for ( ;; )
 218     {
 219         // try to convert the current chunk
 220         size_t lenChunk = MB2WC(NULL, src, 0);
 221         if ( lenChunk == wxCONV_FAILED )
 222             return wxCONV_FAILED;
 223
 224         lenChunk++; // for the L'\0' at the end of this chunk
 225
 226         dstWritten += lenChunk;
 227
 228         if ( lenChunk == 1 )
 229         {
 230             // nothing left in the input string, conversion succeeded
 231             break;
 232         }
 233
 234         if ( dst )
 235         {
 236             if ( dstWritten > dstLen )
 237                 return wxCONV_FAILED;
 238
 239             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 240                 return wxCONV_FAILED;
 241
 242             dst += lenChunk;
 243         }
 244
 245         if ( !srcEnd )
 246         {
 247             // we convert just one chunk in this case as this is the entire
 248             // string anyhow
 249             break;
 250         }
 251
 252         // advance the input pointer past the end of this chunk
 253         while ( NotAllNULs(src, nulLen) )
 254         {
 255             // notice that we must skip over multiple bytes here as we suppose
 256             // that if NUL takes 2 or 4 bytes, then all the other characters do
 257             // too and so if advanced by a single byte we might erroneously
 258             // detect sequences of NUL bytes in the middle of the input
 259             src += nulLen;
 260         }
 261
 262         src += nulLen; // skipping over its terminator as well
 263
 264         // note that ">=" (and not just "==") is needed here as the terminator
 265         // we skipped just above could be inside or just after the buffer
 266         // delimited by inEnd
 267         if ( src >= srcEnd )
 268             break;
 269     }
 270
 271     return dstWritten;
 272 }
 273
 274 size_t
 275 wxMBConv::FromWChar(char *dst, size_t dstLen,
 276                     const wchar_t *src, size_t srcLen) const
 277 {
 278     // the number of chars [which would be] written to dst [if it were not NULL]
 279     size_t dstWritten = 0;
 280
 281     // make a copy of the input string unless it is already properly
 282     // NUL-terminated
 283     //
 284     // if we don't know its length we have no choice but to assume that it is,
 285     // indeed, properly terminated
 286     wxWCharBuffer bufTmp;
 287     if ( srcLen == wxNO_LEN )
 288     {
 289         srcLen = wxWcslen(src) + 1;
 290     }
 291     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 292     {
 293         // make a copy in order to properly NUL-terminate the string
 294         bufTmp = wxWCharBuffer(srcLen);
 295         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 296         src = bufTmp;
 297     }
 298
 299     const size_t lenNul = GetMBNulLen();
 300     for ( const wchar_t * const srcEnd = src + srcLen;
 301           src < srcEnd;
 302           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 303     {
 304         // try to convert the current chunk
 305         size_t lenChunk = WC2MB(NULL, src, 0);
 306
 307         if ( lenChunk == wxCONV_FAILED )
 308             return wxCONV_FAILED;
 309
 310         lenChunk += lenNul;
 311         dstWritten += lenChunk;
 312
 313         if ( dst )
 314         {
 315             if ( dstWritten > dstLen )
 316                 return wxCONV_FAILED;
 317
 318             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 319                 return wxCONV_FAILED;
 320
 321             dst += lenChunk;
 322         }
 323     }
 324
 325     return dstWritten;
 326 }
 327
 328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 329 {
 330     size_t rc = ToWChar(outBuff, outLen, inBuff);
 331     if ( rc != wxCONV_FAILED )
 332     {
 333         // ToWChar() returns the buffer length, i.e. including the trailing
 334         // NUL, while this method doesn't take it into account
 335         rc--;
 336     }
 337
 338     return rc;
 339 }
 340
 341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 342 {
 343     size_t rc = FromWChar(outBuff, outLen, inBuff);
 344     if ( rc != wxCONV_FAILED )
 345     {
 346         rc -= GetMBNulLen();
 347     }
 348
 349     return rc;
 350 }
 351
 352 wxMBConv::~wxMBConv()
 353 {
 354     // nothing to do here (necessary for Darwin linking probably)
 355 }
 356
 357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 358 {
 359     if ( psz )
 360     {
 361         // calculate the length of the buffer needed first
 362         const size_t nLen = MB2WC(NULL, psz, 0);
 363         if ( nLen != wxCONV_FAILED )
 364         {
 365             // now do the actual conversion
 366             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 367
 368             // +1 for the trailing NULL
 369             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 370                 return buf;
 371         }
 372     }
 373
 374     return wxWCharBuffer();
 375 }
 376
 377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 378 {
 379     if ( pwz )
 380     {
 381         const size_t nLen = WC2MB(NULL, pwz, 0);
 382         if ( nLen != wxCONV_FAILED )
 383         {
 384             // extra space for trailing NUL(s)
 385             static const size_t extraLen = GetMaxMBNulLen();
 386
 387             wxCharBuffer buf(nLen + extraLen - 1);
 388             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 389                 return buf;
 390         }
 391     }
 392
 393     return wxCharBuffer();
 394 }
 395
 396 const wxWCharBuffer
 397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 398 {
 399     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 400     if ( dstLen != wxCONV_FAILED )
 401     {
 402         wxWCharBuffer wbuf(dstLen - 1);
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408                 if ( wbuf[dstLen - 1] == L'\0' )
 409                     (*outLen)--;
 410             }
 411
 412             return wbuf;
 413         }
 414     }
 415
 416     if ( outLen )
 417         *outLen = 0;
 418
 419     return wxWCharBuffer();
 420 }
 421
 422 const wxCharBuffer
 423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 424 {
 425     const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 426     if ( dstLen != wxCONV_FAILED )
 427     {
 428         wxCharBuffer buf(dstLen - 1);
 429         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 430         {
 431             if ( outLen )
 432             {
 433                 *outLen = dstLen;
 434
 435                 const size_t nulLen = GetMBNulLen();
 436                 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 437                 {
 438                     // in this case the output is NUL-terminated and we're not
 439                     // supposed to count NUL
 440                     (*outLen) -= nulLen;
 441                 }
 442             }
 443
 444             return buf;
 445         }
 446     }
 447
 448     if ( outLen )
 449         *outLen = 0;
 450
 451     return wxCharBuffer();
 452 }
 453
 454 // ----------------------------------------------------------------------------
 455 // wxMBConvLibc
 456 // ----------------------------------------------------------------------------
 457
 458 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 459 {
 460     return wxMB2WC(buf, psz, n);
 461 }
 462
 463 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 464 {
 465     return wxWC2MB(buf, psz, n);
 466 }
 467
 468 // ----------------------------------------------------------------------------
 469 // wxConvBrokenFileNames
 470 // ----------------------------------------------------------------------------
 471
 472 #ifdef __UNIX__
 473
 474 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 475 {
 476     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 477                   || wxStricmp(charset, _T("UTF8")) == 0  )
 478         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 479     else
 480         m_conv = new wxCSConv(charset);
 481 }
 482
 483 #endif // __UNIX__
 484
 485 // ----------------------------------------------------------------------------
 486 // UTF-7
 487 // ----------------------------------------------------------------------------
 488
 489 // Implementation (C) 2004 Fredrik Roubert
 490
 491 //
 492 // BASE64 decoding table
 493 //
 494 static const unsigned char utf7unb64[] =
 495 {
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 498     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 500     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 501     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 502     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 503     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 504     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 505     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 506     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 507     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 508     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 509     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 510     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 511     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 513     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 516     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 519     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 520     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 521     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 522     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 523     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 524     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 525     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 526     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 527     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 528 };
 529
 530 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 531 {
 532     size_t len = 0;
 533
 534     while ( *psz && (!buf || (len < n)) )
 535     {
 536         unsigned char cc = *psz++;
 537         if (cc != '+')
 538         {
 539             // plain ASCII char
 540             if (buf)
 541                 *buf++ = cc;
 542             len++;
 543         }
 544         else if (*psz == '-')
 545         {
 546             // encoded plus sign
 547             if (buf)
 548                 *buf++ = cc;
 549             len++;
 550             psz++;
 551         }
 552         else // start of BASE64 encoded string
 553         {
 554             bool lsb, ok;
 555             unsigned int d, l;
 556             for ( ok = lsb = false, d = 0, l = 0;
 557                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 558                   psz++ )
 559             {
 560                 d <<= 6;
 561                 d += cc;
 562                 for (l += 6; l >= 8; lsb = !lsb)
 563                 {
 564                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 565                     if (lsb)
 566                     {
 567                         if (buf)
 568                             *buf++ |= c;
 569                         len ++;
 570                     }
 571                     else
 572                     {
 573                         if (buf)
 574                             *buf = (wchar_t)(c << 8);
 575                     }
 576
 577                     ok = true;
 578                 }
 579             }
 580
 581             if ( !ok )
 582             {
 583                 // in valid UTF7 we should have valid characters after '+'
 584                 return wxCONV_FAILED;
 585             }
 586
 587             if (*psz == '-')
 588                 psz++;
 589         }
 590     }
 591
 592     if ( buf && (len < n) )
 593         *buf = '\0';
 594
 595     return len;
 596 }
 597
 598 //
 599 // BASE64 encoding table
 600 //
 601 static const unsigned char utf7enb64[] =
 602 {
 603     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 604     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 605     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 606     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 607     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 608     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 609     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 610     '4', '5', '6', '7', '8', '9', '+', '/'
 611 };
 612
 613 //
 614 // UTF-7 encoding table
 615 //
 616 // 0 - Set D (directly encoded characters)
 617 // 1 - Set O (optional direct characters)
 618 // 2 - whitespace characters (optional)
 619 // 3 - special characters
 620 //
 621 static const unsigned char utf7encode[128] =
 622 {
 623     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 624     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 625     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 626     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 627     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 628     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 629     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 630     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 631 };
 632
 633 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 634 {
 635     size_t len = 0;
 636
 637     while (*psz && ((!buf) || (len < n)))
 638     {
 639         wchar_t cc = *psz++;
 640         if (cc < 0x80 && utf7encode[cc] < 1)
 641         {
 642             // plain ASCII char
 643             if (buf)
 644                 *buf++ = (char)cc;
 645
 646             len++;
 647         }
 648 #ifndef WC_UTF16
 649         else if (((wxUint32)cc) > 0xffff)
 650         {
 651             // no surrogate pair generation (yet?)
 652             return wxCONV_FAILED;
 653         }
 654 #endif
 655         else
 656         {
 657             if (buf)
 658                 *buf++ = '+';
 659
 660             len++;
 661             if (cc != '+')
 662             {
 663                 // BASE64 encode string
 664                 unsigned int lsb, d, l;
 665                 for (d = 0, l = 0; /*nothing*/; psz++)
 666                 {
 667                     for (lsb = 0; lsb < 2; lsb ++)
 668                     {
 669                         d <<= 8;
 670                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 671
 672                         for (l += 8; l >= 6; )
 673                         {
 674                             l -= 6;
 675                             if (buf)
 676                                 *buf++ = utf7enb64[(d >> l) % 64];
 677                             len++;
 678                         }
 679                     }
 680
 681                     cc = *psz;
 682                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 683                         break;
 684                 }
 685
 686                 if (l != 0)
 687                 {
 688                     if (buf)
 689                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 690
 691                     len++;
 692                 }
 693             }
 694
 695             if (buf)
 696                 *buf++ = '-';
 697             len++;
 698         }
 699     }
 700
 701     if (buf && (len < n))
 702         *buf = 0;
 703
 704     return len;
 705 }
 706
 707 // ----------------------------------------------------------------------------
 708 // UTF-8
 709 // ----------------------------------------------------------------------------
 710
 711 static wxUint32 utf8_max[]=
 712     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 713
 714 // boundaries of the private use area we use to (temporarily) remap invalid
 715 // characters invalid in a UTF-8 encoded string
 716 const wxUint32 wxUnicodePUA = 0x100000;
 717 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 718
 719 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 720 {
 721     size_t len = 0;
 722
 723     while (*psz && ((!buf) || (len < n)))
 724     {
 725         const char *opsz = psz;
 726         bool invalid = false;
 727         unsigned char cc = *psz++, fc = cc;
 728         unsigned cnt;
 729         for (cnt = 0; fc & 0x80; cnt++)
 730             fc <<= 1;
 731
 732         if (!cnt)
 733         {
 734             // plain ASCII char
 735             if (buf)
 736                 *buf++ = cc;
 737             len++;
 738
 739             // escape the escape character for octal escapes
 740             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 741                     && cc == '\\' && (!buf || len < n))
 742             {
 743                 if (buf)
 744                     *buf++ = cc;
 745                 len++;
 746             }
 747         }
 748         else
 749         {
 750             cnt--;
 751             if (!cnt)
 752             {
 753                 // invalid UTF-8 sequence
 754                 invalid = true;
 755             }
 756             else
 757             {
 758                 unsigned ocnt = cnt - 1;
 759                 wxUint32 res = cc & (0x3f >> cnt);
 760                 while (cnt--)
 761                 {
 762                     cc = *psz;
 763                     if ((cc & 0xC0) != 0x80)
 764                     {
 765                         // invalid UTF-8 sequence
 766                         invalid = true;
 767                         break;
 768                     }
 769
 770                     psz++;
 771                     res = (res << 6) | (cc & 0x3f);
 772                 }
 773
 774                 if (invalid || res <= utf8_max[ocnt])
 775                 {
 776                     // illegal UTF-8 encoding
 777                     invalid = true;
 778                 }
 779                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 780                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 781                 {
 782                     // if one of our PUA characters turns up externally
 783                     // it must also be treated as an illegal sequence
 784                     // (a bit like you have to escape an escape character)
 785                     invalid = true;
 786                 }
 787                 else
 788                 {
 789 #ifdef WC_UTF16
 790                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 791                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 792                     if (pa == wxCONV_FAILED)
 793                     {
 794                         invalid = true;
 795                     }
 796                     else
 797                     {
 798                         if (buf)
 799                             buf += pa;
 800                         len += pa;
 801                     }
 802 #else // !WC_UTF16
 803                     if (buf)
 804                         *buf++ = (wchar_t)res;
 805                     len++;
 806 #endif // WC_UTF16/!WC_UTF16
 807                 }
 808             }
 809
 810             if (invalid)
 811             {
 812                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 813                 {
 814                     while (opsz < psz && (!buf || len < n))
 815                     {
 816 #ifdef WC_UTF16
 817                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 818                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 819                         wxASSERT(pa != wxCONV_FAILED);
 820                         if (buf)
 821                             buf += pa;
 822                         opsz++;
 823                         len += pa;
 824 #else
 825                         if (buf)
 826                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 827                         opsz++;
 828                         len++;
 829 #endif
 830                     }
 831                 }
 832                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 833                 {
 834                     while (opsz < psz && (!buf || len < n))
 835                     {
 836                         if ( buf && len + 3 < n )
 837                         {
 838                             unsigned char on = *opsz;
 839                             *buf++ = L'\\';
 840                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 841                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 842                             *buf++ = (wchar_t)( L'0' + on % 010 );
 843                         }
 844
 845                         opsz++;
 846                         len += 4;
 847                     }
 848                 }
 849                 else // MAP_INVALID_UTF8_NOT
 850                 {
 851                     return wxCONV_FAILED;
 852                 }
 853             }
 854         }
 855     }
 856
 857     if (buf && (len < n))
 858         *buf = 0;
 859
 860     return len;
 861 }
 862
 863 static inline bool isoctal(wchar_t wch)
 864 {
 865     return L'0' <= wch && wch <= L'7';
 866 }
 867
 868 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 869 {
 870     size_t len = 0;
 871
 872     while (*psz && ((!buf) || (len < n)))
 873     {
 874         wxUint32 cc;
 875
 876 #ifdef WC_UTF16
 877         // cast is ok for WC_UTF16
 878         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 879         psz += (pa == wxCONV_FAILED) ? 1 : pa;
 880 #else
 881         cc = (*psz++) & 0x7fffffff;
 882 #endif
 883
 884         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 885                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 886         {
 887             if (buf)
 888                 *buf++ = (char)(cc - wxUnicodePUA);
 889             len++;
 890         }
 891         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 892                     && cc == L'\\' && psz[0] == L'\\' )
 893         {
 894             if (buf)
 895                 *buf++ = (char)cc;
 896             psz++;
 897             len++;
 898         }
 899         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 900                     cc == L'\\' &&
 901                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 902         {
 903             if (buf)
 904             {
 905                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 906                                  (psz[1] - L'0') * 010 +
 907                                  (psz[2] - L'0'));
 908             }
 909
 910             psz += 3;
 911             len++;
 912         }
 913         else
 914         {
 915             unsigned cnt;
 916             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 917             {
 918             }
 919
 920             if (!cnt)
 921             {
 922                 // plain ASCII char
 923                 if (buf)
 924                     *buf++ = (char) cc;
 925                 len++;
 926             }
 927             else
 928             {
 929                 len += cnt + 1;
 930                 if (buf)
 931                 {
 932                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 933                     while (cnt--)
 934                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 935                 }
 936             }
 937         }
 938     }
 939
 940     if (buf && (len < n))
 941         *buf = 0;
 942
 943     return len;
 944 }
 945
 946 // ============================================================================
 947 // UTF-16
 948 // ============================================================================
 949
 950 #ifdef WORDS_BIGENDIAN
 951     #define wxMBConvUTF16straight wxMBConvUTF16BE
 952     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 953 #else
 954     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 955     #define wxMBConvUTF16straight wxMBConvUTF16LE
 956 #endif
 957
 958 /* static */
 959 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 960 {
 961     if ( srcLen == wxNO_LEN )
 962     {
 963         // count the number of bytes in input, including the trailing NULs
 964         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 965         for ( srcLen = 1; *inBuff++; srcLen++ )
 966             ;
 967
 968         srcLen *= BYTES_PER_CHAR;
 969     }
 970     else // we already have the length
 971     {
 972         // we can only convert an entire number of UTF-16 characters
 973         if ( srcLen % BYTES_PER_CHAR )
 974             return wxCONV_FAILED;
 975     }
 976
 977     return srcLen;
 978 }
 979
 980 // case when in-memory representation is UTF-16 too
 981 #ifdef WC_UTF16
 982
 983 // ----------------------------------------------------------------------------
 984 // conversions without endianness change
 985 // ----------------------------------------------------------------------------
 986
 987 size_t
 988 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 989                                const char *src, size_t srcLen) const
 990 {
 991     // set up the scene for using memcpy() (which is presumably more efficient
 992     // than copying the bytes one by one)
 993     srcLen = GetLength(src, srcLen);
 994     if ( srcLen == wxNO_LEN )
 995         return wxCONV_FAILED;
 996
 997     const size_t inLen = srcLen / BYTES_PER_CHAR;
 998     if ( dst )
 999     {
1000         if ( dstLen < inLen )
1001             return wxCONV_FAILED;
1002
1003         memcpy(dst, src, srcLen);
1004     }
1005
1006     return inLen;
1007 }
1008
1009 size_t
1010 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1011                                  const wchar_t *src, size_t srcLen) const
1012 {
1013     if ( srcLen == wxNO_LEN )
1014         srcLen = wxWcslen(src) + 1;
1015
1016     srcLen *= BYTES_PER_CHAR;
1017
1018     if ( dst )
1019     {
1020         if ( dstLen < srcLen )
1021             return wxCONV_FAILED;
1022
1023         memcpy(dst, src, srcLen);
1024     }
1025
1026     return srcLen;
1027 }
1028
1029 // ----------------------------------------------------------------------------
1030 // endian-reversing conversions
1031 // ----------------------------------------------------------------------------
1032
1033 size_t
1034 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1035                            const char *src, size_t srcLen) const
1036 {
1037     srcLen = GetLength(src, srcLen);
1038     if ( srcLen == wxNO_LEN )
1039         return wxCONV_FAILED;
1040
1041     srcLen /= BYTES_PER_CHAR;
1042
1043     if ( dst )
1044     {
1045         if ( dstLen < srcLen )
1046             return wxCONV_FAILED;
1047
1048         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1049         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1050         {
1051             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1052         }
1053     }
1054
1055     return srcLen;
1056 }
1057
1058 size_t
1059 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1060                              const wchar_t *src, size_t srcLen) const
1061 {
1062     if ( srcLen == wxNO_LEN )
1063         srcLen = wxWcslen(src) + 1;
1064
1065     srcLen *= BYTES_PER_CHAR;
1066
1067     if ( dst )
1068     {
1069         if ( dstLen < srcLen )
1070             return wxCONV_FAILED;
1071
1072         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1073         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1074         {
1075             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1076         }
1077     }
1078
1079     return srcLen;
1080 }
1081
1082 #else // !WC_UTF16: wchar_t is UTF-32
1083
1084 // ----------------------------------------------------------------------------
1085 // conversions without endianness change
1086 // ----------------------------------------------------------------------------
1087
1088 size_t
1089 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1090                                const char *src, size_t srcLen) const
1091 {
1092     srcLen = GetLength(src, srcLen);
1093     if ( srcLen == wxNO_LEN )
1094         return wxCONV_FAILED;
1095
1096     const size_t inLen = srcLen / BYTES_PER_CHAR;
1097     if ( !dst )
1098     {
1099         // optimization: return maximal space which could be needed for this
1100         // string even if the real size could be smaller if the buffer contains
1101         // any surrogates
1102         return inLen;
1103     }
1104
1105     size_t outLen = 0;
1106     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1107     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1108     {
1109         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1110         if ( !inBuff )
1111             return wxCONV_FAILED;
1112
1113         if ( ++outLen > dstLen )
1114             return wxCONV_FAILED;
1115
1116         *dst++ = ch;
1117     }
1118
1119
1120     return outLen;
1121 }
1122
1123 size_t
1124 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1125                                  const wchar_t *src, size_t srcLen) const
1126 {
1127     if ( srcLen == wxNO_LEN )
1128         srcLen = wxWcslen(src) + 1;
1129
1130     size_t outLen = 0;
1131     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1132     for ( size_t n = 0; n < srcLen; n++ )
1133     {
1134         wxUint16 cc[2];
1135         const size_t numChars = encode_utf16(*src++, cc);
1136         if ( numChars == wxCONV_FAILED )
1137             return wxCONV_FAILED;
1138
1139         outLen += numChars * BYTES_PER_CHAR;
1140         if ( outBuff )
1141         {
1142             if ( outLen > dstLen )
1143                 return wxCONV_FAILED;
1144
1145             *outBuff++ = cc[0];
1146             if ( numChars == 2 )
1147             {
1148                 // second character of a surrogate
1149                 *outBuff++ = cc[1];
1150             }
1151         }
1152     }
1153
1154     return outLen;
1155 }
1156
1157 // ----------------------------------------------------------------------------
1158 // endian-reversing conversions
1159 // ----------------------------------------------------------------------------
1160
1161 size_t
1162 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1163                            const char *src, size_t srcLen) const
1164 {
1165     srcLen = GetLength(src, srcLen);
1166     if ( srcLen == wxNO_LEN )
1167         return wxCONV_FAILED;
1168
1169     const size_t inLen = srcLen / BYTES_PER_CHAR;
1170     if ( !dst )
1171     {
1172         // optimization: return maximal space which could be needed for this
1173         // string even if the real size could be smaller if the buffer contains
1174         // any surrogates
1175         return inLen;
1176     }
1177
1178     size_t outLen = 0;
1179     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1180     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1181     {
1182         wxUint32 ch;
1183         wxUint16 tmp[2];
1184
1185         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1186         inBuff++;
1187         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1188
1189         const size_t numChars = decode_utf16(tmp, ch);
1190         if ( numChars == wxCONV_FAILED )
1191             return wxCONV_FAILED;
1192
1193         if ( numChars == 2 )
1194             inBuff++;
1195
1196         if ( ++outLen > dstLen )
1197             return wxCONV_FAILED;
1198
1199         *dst++ = ch;
1200     }
1201
1202
1203     return outLen;
1204 }
1205
1206 size_t
1207 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1208                              const wchar_t *src, size_t srcLen) const
1209 {
1210     if ( srcLen == wxNO_LEN )
1211         srcLen = wxWcslen(src) + 1;
1212
1213     size_t outLen = 0;
1214     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1215     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1216     {
1217         wxUint16 cc[2];
1218         const size_t numChars = encode_utf16(*src, cc);
1219         if ( numChars == wxCONV_FAILED )
1220             return wxCONV_FAILED;
1221
1222         outLen += numChars * BYTES_PER_CHAR;
1223         if ( outBuff )
1224         {
1225             if ( outLen > dstLen )
1226                 return wxCONV_FAILED;
1227
1228             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1229             if ( numChars == 2 )
1230             {
1231                 // second character of a surrogate
1232                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1233             }
1234         }
1235     }
1236
1237     return outLen;
1238 }
1239
1240 #endif // WC_UTF16/!WC_UTF16
1241
1242
1243 // ============================================================================
1244 // UTF-32
1245 // ============================================================================
1246
// Map the "straight" and "swap" converter names used below to the concrete
// UTF-32 classes: "straight" is the class whose byte order matches this
// machine (BE if WORDS_BIGENDIAN is defined, LE otherwise) and "swap" is the
// other one, whose data must be byte-swapped.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// global UTF-32 converter objects, one per byte order
// NOTE(review): WXDLLIMPEXP_DATA_BASE is presumably the DLL export/import
// decoration for data -- its definition is not visible here
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1258
1259 /* static */
1260 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1261 {
1262     if ( srcLen == wxNO_LEN )
1263     {
1264         // count the number of bytes in input, including the trailing NULs
1265         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1266         for ( srcLen = 1; *inBuff++; srcLen++ )
1267             ;
1268
1269         srcLen *= BYTES_PER_CHAR;
1270     }
1271     else // we already have the length
1272     {
1273         // we can only convert an entire number of UTF-32 characters
1274         if ( srcLen % BYTES_PER_CHAR )
1275             return wxCONV_FAILED;
1276     }
1277
1278     return srcLen;
1279 }
1280
1281 // case when in-memory representation is UTF-16
1282 #ifdef WC_UTF16
1283
1284 // ----------------------------------------------------------------------------
1285 // conversions without endianness change
1286 // ----------------------------------------------------------------------------
1287
1288 size_t
1289 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1290                                const char *src, size_t srcLen) const
1291 {
1292     srcLen = GetLength(src, srcLen);
1293     if ( srcLen == wxNO_LEN )
1294         return wxCONV_FAILED;
1295
1296     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1297     const size_t inLen = srcLen / BYTES_PER_CHAR;
1298     size_t outLen = 0;
1299     for ( size_t n = 0; n < inLen; n++ )
1300     {
1301         wxUint16 cc[2];
1302         const size_t numChars = encode_utf16(*inBuff++, cc);
1303         if ( numChars == wxCONV_FAILED )
1304             return wxCONV_FAILED;
1305
1306         outLen += numChars;
1307         if ( dst )
1308         {
1309             if ( outLen > dstLen )
1310                 return wxCONV_FAILED;
1311
1312             *dst++ = cc[0];
1313             if ( numChars == 2 )
1314             {
1315                 // second character of a surrogate
1316                 *dst++ = cc[1];
1317             }
1318         }
1319     }
1320
1321     return outLen;
1322 }
1323
1324 size_t
1325 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1326                                  const wchar_t *src, size_t srcLen) const
1327 {
1328     if ( srcLen == wxNO_LEN )
1329         srcLen = wxWcslen(src) + 1;
1330
1331     if ( !dst )
1332     {
1333         // optimization: return maximal space which could be needed for this
1334         // string instead of the exact amount which could be less if there are
1335         // any surrogates in the input
1336         //
1337         // we consider that surrogates are rare enough to make it worthwhile to
1338         // avoid running the loop below at the cost of slightly extra memory
1339         // consumption
1340         return srcLen * BYTES_PER_CHAR;
1341     }
1342
1343     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1344     size_t outLen = 0;
1345     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1346     {
1347         const wxUint32 ch = wxDecodeSurrogate(&src);
1348         if ( !src )
1349             return wxCONV_FAILED;
1350
1351         outLen += BYTES_PER_CHAR;
1352
1353         if ( outLen > dstLen )
1354             return wxCONV_FAILED;
1355
1356         *outBuff++ = ch;
1357     }
1358
1359     return outLen;
1360 }
1361
1362 // ----------------------------------------------------------------------------
1363 // endian-reversing conversions
1364 // ----------------------------------------------------------------------------
1365
1366 size_t
1367 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1368                            const char *src, size_t srcLen) const
1369 {
1370     srcLen = GetLength(src, srcLen);
1371     if ( srcLen == wxNO_LEN )
1372         return wxCONV_FAILED;
1373
1374     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1375     const size_t inLen = srcLen / BYTES_PER_CHAR;
1376     size_t outLen = 0;
1377     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1378     {
1379         wxUint16 cc[2];
1380         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1381         if ( numChars == wxCONV_FAILED )
1382             return wxCONV_FAILED;
1383
1384         outLen += numChars;
1385         if ( dst )
1386         {
1387             if ( outLen > dstLen )
1388                 return wxCONV_FAILED;
1389
1390             *dst++ = cc[0];
1391             if ( numChars == 2 )
1392             {
1393                 // second character of a surrogate
1394                 *dst++ = cc[1];
1395             }
1396         }
1397     }
1398
1399     return outLen;
1400 }
1401
1402 size_t
1403 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1404                              const wchar_t *src, size_t srcLen) const
1405 {
1406     if ( srcLen == wxNO_LEN )
1407         srcLen = wxWcslen(src) + 1;
1408
1409     if ( !dst )
1410     {
1411         // optimization: return maximal space which could be needed for this
1412         // string instead of the exact amount which could be less if there are
1413         // any surrogates in the input
1414         //
1415         // we consider that surrogates are rare enough to make it worthwhile to
1416         // avoid running the loop below at the cost of slightly extra memory
1417         // consumption
1418         return srcLen*BYTES_PER_CHAR;
1419     }
1420
1421     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1422     size_t outLen = 0;
1423     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1424     {
1425         const wxUint32 ch = wxDecodeSurrogate(&src);
1426         if ( !src )
1427             return wxCONV_FAILED;
1428
1429         outLen += BYTES_PER_CHAR;
1430
1431         if ( outLen > dstLen )
1432             return wxCONV_FAILED;
1433
1434         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1435     }
1436
1437     return outLen;
1438 }
1439
1440 #else // !WC_UTF16: wchar_t is UTF-32
1441
1442 // ----------------------------------------------------------------------------
1443 // conversions without endianness change
1444 // ----------------------------------------------------------------------------
1445
1446 size_t
1447 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1448                                const char *src, size_t srcLen) const
1449 {
1450     // use memcpy() as it should be much faster than hand-written loop
1451     srcLen = GetLength(src, srcLen);
1452     if ( srcLen == wxNO_LEN )
1453         return wxCONV_FAILED;
1454
1455     const size_t inLen = srcLen/BYTES_PER_CHAR;
1456     if ( dst )
1457     {
1458         if ( dstLen < inLen )
1459             return wxCONV_FAILED;
1460
1461         memcpy(dst, src, srcLen);
1462     }
1463
1464     return inLen;
1465 }
1466
1467 size_t
1468 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1469                                  const wchar_t *src, size_t srcLen) const
1470 {
1471     if ( srcLen == wxNO_LEN )
1472         srcLen = wxWcslen(src) + 1;
1473
1474     srcLen *= BYTES_PER_CHAR;
1475
1476     if ( dst )
1477     {
1478         if ( dstLen < srcLen )
1479             return wxCONV_FAILED;
1480
1481         memcpy(dst, src, srcLen);
1482     }
1483
1484     return srcLen;
1485 }
1486
1487 // ----------------------------------------------------------------------------
1488 // endian-reversing conversions
1489 // ----------------------------------------------------------------------------
1490
1491 size_t
1492 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1493                            const char *src, size_t srcLen) const
1494 {
1495     srcLen = GetLength(src, srcLen);
1496     if ( srcLen == wxNO_LEN )
1497         return wxCONV_FAILED;
1498
1499     srcLen /= BYTES_PER_CHAR;
1500
1501     if ( dst )
1502     {
1503         if ( dstLen < srcLen )
1504             return wxCONV_FAILED;
1505
1506         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1507         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1508         {
1509             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1510         }
1511     }
1512
1513     return srcLen;
1514 }
1515
1516 size_t
1517 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1518                              const wchar_t *src, size_t srcLen) const
1519 {
1520     if ( srcLen == wxNO_LEN )
1521         srcLen = wxWcslen(src) + 1;
1522
1523     srcLen *= BYTES_PER_CHAR;
1524
1525     if ( dst )
1526     {
1527         if ( dstLen < srcLen )
1528             return wxCONV_FAILED;
1529
1530         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1531         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1532         {
1533             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1534         }
1535     }
1536
1537     return srcLen;
1538 }
1539
1540 #endif // WC_UTF16/!WC_UTF16
1541
1542
1543 // ============================================================================
1544 // The classes doing conversion using the iconv_xxx() functions
1545 // ============================================================================
1546
1547 #ifdef HAVE_ICONV
1548
1549 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1550 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1551 //     (unless there's yet another bug in glibc) the only case when iconv()
1552 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1553 //     left in the input buffer -- when _real_ error occurs,
1554 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1555 //     iconv() failure.
1556 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests the result cres of an iconv() call given
// the number of input bytes left: for the old glibc versions selected by the
// check below, a (size_t)-1 result with errno == E2BIG and no input left is
// not counted as failure (see the explanation above), otherwise any
// (size_t)-1 result is a failure
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// iconv() argument
// NOTE(review): ICONV_CONST is presumably set by the build configuration to
// match the iconv() prototype -- its definition is not visible here
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value against which the handles returned by iconv_open() are compared to
// detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// select the byte swapping macro and the font encoding corresponding to
// wchar_t from its size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1578
1579 // ----------------------------------------------------------------------------
1580 // wxMBConv_iconv: encapsulates an iconv character set
1581 // ----------------------------------------------------------------------------
1582
1583 class wxMBConv_iconv : public wxMBConv
1584 {
1585 public:
1586     wxMBConv_iconv(const wxChar *name);
1587     virtual ~wxMBConv_iconv();
1588
1589     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1590     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1591
1592     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1593     virtual size_t GetMBNulLen() const;
1594
1595     virtual wxMBConv *Clone() const
1596     {
1597         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1598         p->m_minMBCharWidth = m_minMBCharWidth;
1599         return p;
1600     }
1601
1602     bool IsOk() const
1603         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1604
1605 protected:
1606     // the iconv handlers used to translate from multibyte
1607     // to wide char and in the other direction
1608     iconv_t m2w,
1609             w2m;
1610
1611 #if wxUSE_THREADS
1612     // guards access to m2w and w2m objects
1613     wxMutex m_iconvMutex;
1614 #endif
1615
1616 private:
1617     // the name (for iconv_open()) of a wide char charset -- if none is
1618     // available on this machine, it will remain NULL
1619     static wxString ms_wcCharsetName;
1620
1621     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1622     // different endian-ness than the native one
1623     static bool ms_wcNeedsSwap;
1624
1625
1626     // name of the encoding handled by this conversion
1627     wxString m_name;
1628
1629     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1630     // initially
1631     size_t m_minMBCharWidth;
1632 };
1633
1634 // make the constructor available for unit testing
1635 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1636 {
1637     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1638     if ( !result->IsOk() )
1639     {
1640         delete result;
1641         return 0;
1642     }
1643
1644     return result;
1645 }
1646
// definitions of the static members of wxMBConv_iconv: the charset name
// starts out empty and is filled in by the constructor once it finds an
// iconv charset usable for wchar_t
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor if the charset it found needs byte swapping
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1649
1650 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1651               : m_name(name)
1652 {
1653     m_minMBCharWidth = 0;
1654
1655     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1656     // names for the charsets
1657     const wxCharBuffer cname(wxString(name).ToAscii());
1658
1659     // check for charset that represents wchar_t:
1660     if ( ms_wcCharsetName.empty() )
1661     {
1662         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1663
1664 #if wxUSE_FONTMAP
1665         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1666 #else // !wxUSE_FONTMAP
1667         static const wxChar *names[] =
1668         {
1669 #if SIZEOF_WCHAR_T == 4
1670             _T("UCS-4"),
1671 #elif SIZEOF_WCHAR_T = 2
1672             _T("UCS-2"),
1673 #endif
1674             NULL
1675         };
1676 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1677
1678         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1679         {
1680             const wxString nameCS(*names);
1681
1682             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1683             wxString nameXE(nameCS);
1684
1685 #ifdef WORDS_BIGENDIAN
1686                 nameXE += _T("BE");
1687 #else // little endian
1688                 nameXE += _T("LE");
1689 #endif
1690
1691             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1692                        nameXE.c_str());
1693
1694             m2w = iconv_open(nameXE.ToAscii(), cname);
1695             if ( m2w == ICONV_T_INVALID )
1696             {
1697                 // try charset w/o bytesex info (e.g. "UCS4")
1698                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1699                            nameCS.c_str());
1700                 m2w = iconv_open(nameCS.ToAscii(), cname);
1701
1702                 // and check for bytesex ourselves:
1703                 if ( m2w != ICONV_T_INVALID )
1704                 {
1705                     char    buf[2], *bufPtr;
1706                     wchar_t wbuf[2], *wbufPtr;
1707                     size_t  insz, outsz;
1708                     size_t  res;
1709
1710                     buf[0] = 'A';
1711                     buf[1] = 0;
1712                     wbuf[0] = 0;
1713                     insz = 2;
1714                     outsz = SIZEOF_WCHAR_T * 2;
1715                     wbufPtr = wbuf;
1716                     bufPtr = buf;
1717
1718                     res = iconv(
1719                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1720                         (char**)&wbufPtr, &outsz);
1721
1722                     if (ICONV_FAILED(res, insz))
1723                     {
1724                         wxLogLastError(wxT("iconv"));
1725                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1726                                    nameCS.c_str());
1727                     }
1728                     else // ok, can convert to this encoding, remember it
1729                     {
1730                         ms_wcCharsetName = nameCS;
1731                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1732                     }
1733                 }
1734             }
1735             else // use charset not requiring byte swapping
1736             {
1737                 ms_wcCharsetName = nameXE;
1738             }
1739         }
1740
1741         wxLogTrace(TRACE_STRCONV,
1742                    wxT("iconv wchar_t charset is \"%s\"%s"),
1743                    ms_wcCharsetName.empty() ? _T("<none>")
1744                                             : ms_wcCharsetName.c_str(),
1745                    ms_wcNeedsSwap ? _T(" (needs swap)")
1746                                   : _T(""));
1747     }
1748     else // we already have ms_wcCharsetName
1749     {
1750         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1751     }
1752
1753     if ( ms_wcCharsetName.empty() )
1754     {
1755         w2m = ICONV_T_INVALID;
1756     }
1757     else
1758     {
1759         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1760         if ( w2m == ICONV_T_INVALID )
1761         {
1762             wxLogTrace(TRACE_STRCONV,
1763                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1764                        ms_wcCharsetName.c_str(), cname.data());
1765         }
1766     }
1767 }
1768
1769 wxMBConv_iconv::~wxMBConv_iconv()
1770 {
1771     if ( m2w != ICONV_T_INVALID )
1772         iconv_close(m2w);
1773     if ( w2m != ICONV_T_INVALID )
1774         iconv_close(w2m);
1775 }
1776
1777 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1778 {
1779     // find the string length: notice that must be done differently for
1780     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1781     size_t inbuf;
1782     const size_t nulLen = GetMBNulLen();
1783     switch ( nulLen )
1784     {
1785         default:
1786             return wxCONV_FAILED;
1787
1788         case 1:
1789             inbuf = strlen(psz); // arguably more optimized than our version
1790             break;
1791
1792         case 2:
1793         case 4:
1794             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1795             // they also have to start at character boundary and not span two
1796             // adjacent characters
1797             const char *p;
1798             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1799                 ;
1800             inbuf = p - psz;
1801             break;
1802     }
1803
1804 #if wxUSE_THREADS
1805     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1806     //     Unfortunately there is a couple of global wxCSConv objects such as
1807     //     wxConvLocal that are used all over wx code, so we have to make sure
1808     //     the handle is used by at most one thread at the time. Otherwise
1809     //     only a few wx classes would be safe to use from non-main threads
1810     //     as MB<->WC conversion would fail "randomly".
1811     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1812 #endif // wxUSE_THREADS
1813
1814     size_t outbuf = n * SIZEOF_WCHAR_T;
1815     size_t res, cres;
1816     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1817     wchar_t *bufPtr = buf;
1818     const char *pszPtr = psz;
1819
1820     if (buf)
1821     {
1822         // have destination buffer, convert there
1823         cres = iconv(m2w,
1824                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1825                      (char**)&bufPtr, &outbuf);
1826         res = n - (outbuf / SIZEOF_WCHAR_T);
1827
1828         if (ms_wcNeedsSwap)
1829         {
1830             // convert to native endianness
1831             for ( unsigned i = 0; i < res; i++ )
1832                 buf[n] = WC_BSWAP(buf[i]);
1833         }
1834
1835         // NUL-terminate the string if there is any space left
1836         if (res < n)
1837             buf[res] = 0;
1838     }
1839     else
1840     {
1841         // no destination buffer... convert using temp buffer
1842         // to calculate destination buffer requirement
1843         wchar_t tbuf[8];
1844         res = 0;
1845
1846         do
1847         {
1848             bufPtr = tbuf;
1849             outbuf = 8 * SIZEOF_WCHAR_T;
1850
1851             cres = iconv(m2w,
1852                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1853                          (char**)&bufPtr, &outbuf );
1854
1855             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1856         }
1857         while ((cres == (size_t)-1) && (errno == E2BIG));
1858     }
1859
1860     if (ICONV_FAILED(cres, inbuf))
1861     {
1862         //VS: it is ok if iconv fails, hence trace only
1863         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1864         return wxCONV_FAILED;
1865     }
1866
1867     return res;
1868 }
1869
1870 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1871 {
1872 #if wxUSE_THREADS
1873     // NB: explained in MB2WC
1874     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1875 #endif
1876
1877     size_t inlen = wxWcslen(psz);
1878     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1879     size_t outbuf = n;
1880     size_t res, cres;
1881
1882     wchar_t *tmpbuf = 0;
1883
1884     if (ms_wcNeedsSwap)
1885     {
1886         // need to copy to temp buffer to switch endianness
1887         // (doing WC_BSWAP twice on the original buffer won't help, as it
1888         //  could be in read-only memory, or be accessed in some other thread)
1889         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1890         for ( size_t i = 0; i < inlen; i++ )
1891             tmpbuf[n] = WC_BSWAP(psz[i]);
1892
1893         tmpbuf[inlen] = L'\0';
1894         psz = tmpbuf;
1895     }
1896
1897     if (buf)
1898     {
1899         // have destination buffer, convert there
1900         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1901
1902         res = n - outbuf;
1903
1904         // NB: iconv was given only wcslen(psz) characters on input, and so
1905         //     it couldn't convert the trailing zero. Let's do it ourselves
1906         //     if there's some room left for it in the output buffer.
1907         if (res < n)
1908             buf[0] = 0;
1909     }
1910     else
1911     {
1912         // no destination buffer: convert using temp buffer
1913         // to calculate destination buffer requirement
1914         char tbuf[16];
1915         res = 0;
1916         do
1917         {
1918             buf = tbuf;
1919             outbuf = 16;
1920
1921             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1922
1923             res += 16 - outbuf;
1924         }
1925         while ((cres == (size_t)-1) && (errno == E2BIG));
1926     }
1927
1928     if (ms_wcNeedsSwap)
1929     {
1930         free(tmpbuf);
1931     }
1932
1933     if (ICONV_FAILED(cres, inbuf))
1934     {
1935         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1936         return wxCONV_FAILED;
1937     }
1938
1939     return res;
1940 }
1941
1942 size_t wxMBConv_iconv::GetMBNulLen() const
1943 {
1944     if ( m_minMBCharWidth == 0 )
1945     {
1946         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1947
1948 #if wxUSE_THREADS
1949         // NB: explained in MB2WC
1950         wxMutexLocker lock(self->m_iconvMutex);
1951 #endif
1952
1953         wchar_t *wnul = L"";
1954         char buf[8]; // should be enough for NUL in any encoding
1955         size_t inLen = sizeof(wchar_t),
1956                outLen = WXSIZEOF(buf);
1957         char *inBuff = (char *)wnul;
1958         char *outBuff = buf;
1959         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1960         {
1961             self->m_minMBCharWidth = (size_t)-1;
1962         }
1963         else // ok
1964         {
1965             self->m_minMBCharWidth = outBuff - buf;
1966         }
1967     }
1968
1969     return m_minMBCharWidth;
1970 }
1971
1972 #endif // HAVE_ICONV
1973
1974
1975 // ============================================================================
1976 // Win32 conversion classes
1977 // ============================================================================
1978
1979 #ifdef wxHAVE_WIN32_MB2WC
1980
1981 // from utils.cpp
1982 #if wxUSE_FONTMAP
1983 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1984 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1985 #endif
1986
1987 class wxMBConv_win32 : public wxMBConv
1988 {
1989 public:
1990     wxMBConv_win32()
1991     {
1992         m_CodePage = CP_ACP;
1993         m_minMBCharWidth = 0;
1994     }
1995
1996     wxMBConv_win32(const wxMBConv_win32& conv)
1997     {
1998         m_CodePage = conv.m_CodePage;
1999         m_minMBCharWidth = conv.m_minMBCharWidth;
2000     }
2001
2002 #if wxUSE_FONTMAP
2003     wxMBConv_win32(const wxChar* name)
2004     {
2005         m_CodePage = wxCharsetToCodepage(name);
2006         m_minMBCharWidth = 0;
2007     }
2008
2009     wxMBConv_win32(wxFontEncoding encoding)
2010     {
2011         m_CodePage = wxEncodingToCodepage(encoding);
2012         m_minMBCharWidth = 0;
2013     }
2014 #endif // wxUSE_FONTMAP
2015
2016     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2017     {
2018         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2019         // the behaviour is not compatible with the Unix version (using iconv)
2020         // and break the library itself, e.g. wxTextInputStream::NextChar()
2021         // wouldn't work if reading an incomplete MB char didn't result in an
2022         // error
2023         //
2024         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2025         // Win XP or newer and it is not supported for UTF-[78] so we always
2026         // use our own conversions in this case. See
2027         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2028         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2029         if ( m_CodePage == CP_UTF8 )
2030         {
2031             return wxConvUTF8.MB2WC(buf, psz, n);
2032         }
2033
2034         if ( m_CodePage == CP_UTF7 )
2035         {
2036             return wxConvUTF7.MB2WC(buf, psz, n);
2037         }
2038
2039         int flags = 0;
2040         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2041                 IsAtLeastWin2kSP4() )
2042         {
2043             flags = MB_ERR_INVALID_CHARS;
2044         }
2045
2046         const size_t len = ::MultiByteToWideChar
2047                              (
2048                                 m_CodePage,     // code page
2049                                 flags,          // flags: fall on error
2050                                 psz,            // input string
2051                                 -1,             // its length (NUL-terminated)
2052                                 buf,            // output string
2053                                 buf ? n : 0     // size of output buffer
2054                              );
2055         if ( !len )
2056         {
2057             // function totally failed
2058             return wxCONV_FAILED;
2059         }
2060
2061         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2062         // check if we succeeded, by doing a double trip:
2063         if ( !flags && buf )
2064         {
2065             const size_t mbLen = strlen(psz);
2066             wxCharBuffer mbBuf(mbLen);
2067             if ( ::WideCharToMultiByte
2068                    (
2069                       m_CodePage,
2070                       0,
2071                       buf,
2072                       -1,
2073                       mbBuf.data(),
2074                       mbLen + 1,        // size in bytes, not length
2075                       NULL,
2076                       NULL
2077                    ) == 0 ||
2078                   strcmp(mbBuf, psz) != 0 )
2079             {
2080                 // we didn't obtain the same thing we started from, hence
2081                 // the conversion was lossy and we consider that it failed
2082                 return wxCONV_FAILED;
2083             }
2084         }
2085
2086         // note that it returns count of written chars for buf != NULL and size
2087         // of the needed buffer for buf == NULL so in either case the length of
2088         // the string (which never includes the terminating NUL) is one less
2089         return len - 1;
2090     }
2091
2092     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2093     {
2094         /*
2095             we have a problem here: by default, WideCharToMultiByte() may
2096             replace characters unrepresentable in the target code page with bad
2097             quality approximations such as turning "1/2" symbol (U+00BD) into
2098             "1" for the code pages which don't have it and we, obviously, want
2099             to avoid this at any price
2100
2101             the trouble is that this function does it _silently_, i.e. it won't
2102             even tell us whether it did or not... Win98/2000 and higher provide
2103             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2104             we have to resort to a round trip, i.e. check that converting back
2105             results in the same string -- this is, of course, expensive but
2106             otherwise we simply can't be sure to not garble the data.
2107          */
2108
2109         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2110         // it doesn't work with CJK encodings (which we test for rather roughly
2111         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2112         // supporting it
2113         BOOL usedDef wxDUMMY_INITIALIZE(false);
2114         BOOL *pUsedDef;
2115         int flags;
2116         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2117         {
2118             // it's our lucky day
2119             flags = WC_NO_BEST_FIT_CHARS;
2120             pUsedDef = &usedDef;
2121         }
2122         else // old system or unsupported encoding
2123         {
2124             flags = 0;
2125             pUsedDef = NULL;
2126         }
2127
2128         const size_t len = ::WideCharToMultiByte
2129                              (
2130                                 m_CodePage,     // code page
2131                                 flags,          // either none or no best fit
2132                                 pwz,            // input string
2133                                 -1,             // it is (wide) NUL-terminated
2134                                 buf,            // output buffer
2135                                 buf ? n : 0,    // and its size
2136                                 NULL,           // default "replacement" char
2137                                 pUsedDef        // [out] was it used?
2138                              );
2139
2140         if ( !len )
2141         {
2142             // function totally failed
2143             return wxCONV_FAILED;
2144         }
2145
2146         // if we were really converting, check if we succeeded
2147         if ( buf )
2148         {
2149             if ( flags )
2150             {
2151                 // check if the conversion failed, i.e. if any replacements
2152                 // were done
2153                 if ( usedDef )
2154                     return wxCONV_FAILED;
2155             }
2156             else // we must resort to double tripping...
2157             {
2158                 wxWCharBuffer wcBuf(n);
2159                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2160                         wcscmp(wcBuf, pwz) != 0 )
2161                 {
2162                     // we didn't obtain the same thing we started from, hence
2163                     // the conversion was lossy and we consider that it failed
2164                     return wxCONV_FAILED;
2165                 }
2166             }
2167         }
2168
2169         // see the comment above for the reason of "len - 1"
2170         return len - 1;
2171     }
2172
2173     virtual size_t GetMBNulLen() const
2174     {
2175         if ( m_minMBCharWidth == 0 )
2176         {
2177             int len = ::WideCharToMultiByte
2178                         (
2179                             m_CodePage,     // code page
2180                             0,              // no flags
2181                             L"",            // input string
2182                             1,              // translate just the NUL
2183                             NULL,           // output buffer
2184                             0,              // and its size
2185                             NULL,           // no replacement char
2186                             NULL            // [out] don't care if it was used
2187                         );
2188
2189             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2190             switch ( len )
2191             {
2192                 default:
2193                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2194                     self->m_minMBCharWidth = (size_t)-1;
2195                     break;
2196
2197                 case 0:
2198                     self->m_minMBCharWidth = (size_t)-1;
2199                     break;
2200
2201                 case 1:
2202                 case 2:
2203                 case 4:
2204                     self->m_minMBCharWidth = len;
2205                     break;
2206             }
2207         }
2208
2209         return m_minMBCharWidth;
2210     }
2211
2212     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2213
2214     bool IsOk() const { return m_CodePage != -1; }
2215
2216 private:
2217     static bool CanUseNoBestFit()
2218     {
2219         static int s_isWin98Or2k = -1;
2220
2221         if ( s_isWin98Or2k == -1 )
2222         {
2223             int verMaj, verMin;
2224             switch ( wxGetOsVersion(&verMaj, &verMin) )
2225             {
2226                 case wxWIN95:
2227                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2228                     break;
2229
2230                 case wxWINDOWS_NT:
2231                     s_isWin98Or2k = verMaj >= 5;
2232                     break;
2233
2234                 default:
2235                     // unknown: be conservative by default
2236                     s_isWin98Or2k = 0;
2237                     break;
2238             }
2239
2240             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2241         }
2242
2243         return s_isWin98Or2k == 1;
2244     }
2245
2246     static bool IsAtLeastWin2kSP4()
2247     {
2248 #ifdef __WXWINCE__
2249         return false;
2250 #else
2251         static int s_isAtLeastWin2kSP4 = -1;
2252
2253         if ( s_isAtLeastWin2kSP4 == -1 )
2254         {
2255             OSVERSIONINFOEX ver;
2256
2257             memset(&ver, 0, sizeof(ver));
2258             ver.dwOSVersionInfoSize = sizeof(ver);
2259             GetVersionEx((OSVERSIONINFO*)&ver);
2260
2261             s_isAtLeastWin2kSP4 =
2262               ((ver.dwMajorVersion > 5) || // Vista+
2263                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2264                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2265                ver.wServicePackMajor >= 4)) // 2000 SP4+
2266               ? 1 : 0;
2267         }
2268
2269         return s_isAtLeastWin2kSP4 == 1;
2270 #endif
2271     }
2272
2273
2274     // the code page we're working with
2275     long m_CodePage;
2276
2277     // cached result of GetMBNulLen(), set to 0 initially meaning
2278     // "unknown"
2279     size_t m_minMBCharWidth;
2280 };
2281
2282 #endif // wxHAVE_WIN32_MB2WC
2283
2284 // ============================================================================
2285 // Cocoa conversion classes
2286 // ============================================================================
2287
2288 #if defined(__WXCOCOA__)
2289
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2293
2294 #include <CoreFoundation/CFString.h>
2295 #include <CoreFoundation/CFStringEncodingExt.h>
2296
2297 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2298 {
2299     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2300
2301     switch (encoding)
2302     {
2303         case wxFONTENCODING_DEFAULT :
2304             enc = CFStringGetSystemEncoding();
2305             break ;
2306
2307         case wxFONTENCODING_ISO8859_1 :
2308             enc = kCFStringEncodingISOLatin1 ;
2309             break ;
2310         case wxFONTENCODING_ISO8859_2 :
2311             enc = kCFStringEncodingISOLatin2;
2312             break ;
2313         case wxFONTENCODING_ISO8859_3 :
2314             enc = kCFStringEncodingISOLatin3 ;
2315             break ;
2316         case wxFONTENCODING_ISO8859_4 :
2317             enc = kCFStringEncodingISOLatin4;
2318             break ;
2319         case wxFONTENCODING_ISO8859_5 :
2320             enc = kCFStringEncodingISOLatinCyrillic;
2321             break ;
2322         case wxFONTENCODING_ISO8859_6 :
2323             enc = kCFStringEncodingISOLatinArabic;
2324             break ;
2325         case wxFONTENCODING_ISO8859_7 :
2326             enc = kCFStringEncodingISOLatinGreek;
2327             break ;
2328         case wxFONTENCODING_ISO8859_8 :
2329             enc = kCFStringEncodingISOLatinHebrew;
2330             break ;
2331         case wxFONTENCODING_ISO8859_9 :
2332             enc = kCFStringEncodingISOLatin5;
2333             break ;
2334         case wxFONTENCODING_ISO8859_10 :
2335             enc = kCFStringEncodingISOLatin6;
2336             break ;
2337         case wxFONTENCODING_ISO8859_11 :
2338             enc = kCFStringEncodingISOLatinThai;
2339             break ;
2340         case wxFONTENCODING_ISO8859_13 :
2341             enc = kCFStringEncodingISOLatin7;
2342             break ;
2343         case wxFONTENCODING_ISO8859_14 :
2344             enc = kCFStringEncodingISOLatin8;
2345             break ;
2346         case wxFONTENCODING_ISO8859_15 :
2347             enc = kCFStringEncodingISOLatin9;
2348             break ;
2349
2350         case wxFONTENCODING_KOI8 :
2351             enc = kCFStringEncodingKOI8_R;
2352             break ;
2353         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2354             enc = kCFStringEncodingDOSRussian;
2355             break ;
2356
2357 //      case wxFONTENCODING_BULGARIAN :
2358 //          enc = ;
2359 //          break ;
2360
2361         case wxFONTENCODING_CP437 :
2362             enc = kCFStringEncodingDOSLatinUS ;
2363             break ;
2364         case wxFONTENCODING_CP850 :
2365             enc = kCFStringEncodingDOSLatin1;
2366             break ;
2367         case wxFONTENCODING_CP852 :
2368             enc = kCFStringEncodingDOSLatin2;
2369             break ;
2370         case wxFONTENCODING_CP855 :
2371             enc = kCFStringEncodingDOSCyrillic;
2372             break ;
2373         case wxFONTENCODING_CP866 :
2374             enc = kCFStringEncodingDOSRussian ;
2375             break ;
2376         case wxFONTENCODING_CP874 :
2377             enc = kCFStringEncodingDOSThai;
2378             break ;
2379         case wxFONTENCODING_CP932 :
2380             enc = kCFStringEncodingDOSJapanese;
2381             break ;
2382         case wxFONTENCODING_CP936 :
2383             enc = kCFStringEncodingDOSChineseSimplif ;
2384             break ;
2385         case wxFONTENCODING_CP949 :
2386             enc = kCFStringEncodingDOSKorean;
2387             break ;
2388         case wxFONTENCODING_CP950 :
2389             enc = kCFStringEncodingDOSChineseTrad;
2390             break ;
2391         case wxFONTENCODING_CP1250 :
2392             enc = kCFStringEncodingWindowsLatin2;
2393             break ;
2394         case wxFONTENCODING_CP1251 :
2395             enc = kCFStringEncodingWindowsCyrillic ;
2396             break ;
2397         case wxFONTENCODING_CP1252 :
2398             enc = kCFStringEncodingWindowsLatin1 ;
2399             break ;
2400         case wxFONTENCODING_CP1253 :
2401             enc = kCFStringEncodingWindowsGreek;
2402             break ;
2403         case wxFONTENCODING_CP1254 :
2404             enc = kCFStringEncodingWindowsLatin5;
2405             break ;
2406         case wxFONTENCODING_CP1255 :
2407             enc = kCFStringEncodingWindowsHebrew ;
2408             break ;
2409         case wxFONTENCODING_CP1256 :
2410             enc = kCFStringEncodingWindowsArabic ;
2411             break ;
2412         case wxFONTENCODING_CP1257 :
2413             enc = kCFStringEncodingWindowsBalticRim;
2414             break ;
2415 //   This only really encodes to UTF7 (if that) evidently
2416 //        case wxFONTENCODING_UTF7 :
2417 //            enc = kCFStringEncodingNonLossyASCII ;
2418 //            break ;
2419         case wxFONTENCODING_UTF8 :
2420             enc = kCFStringEncodingUTF8 ;
2421             break ;
2422         case wxFONTENCODING_EUC_JP :
2423             enc = kCFStringEncodingEUC_JP;
2424             break ;
2425         case wxFONTENCODING_UTF16 :
2426             enc = kCFStringEncodingUnicode ;
2427             break ;
2428         case wxFONTENCODING_MACROMAN :
2429             enc = kCFStringEncodingMacRoman ;
2430             break ;
2431         case wxFONTENCODING_MACJAPANESE :
2432             enc = kCFStringEncodingMacJapanese ;
2433             break ;
2434         case wxFONTENCODING_MACCHINESETRAD :
2435             enc = kCFStringEncodingMacChineseTrad ;
2436             break ;
2437         case wxFONTENCODING_MACKOREAN :
2438             enc = kCFStringEncodingMacKorean ;
2439             break ;
2440         case wxFONTENCODING_MACARABIC :
2441             enc = kCFStringEncodingMacArabic ;
2442             break ;
2443         case wxFONTENCODING_MACHEBREW :
2444             enc = kCFStringEncodingMacHebrew ;
2445             break ;
2446         case wxFONTENCODING_MACGREEK :
2447             enc = kCFStringEncodingMacGreek ;
2448             break ;
2449         case wxFONTENCODING_MACCYRILLIC :
2450             enc = kCFStringEncodingMacCyrillic ;
2451             break ;
2452         case wxFONTENCODING_MACDEVANAGARI :
2453             enc = kCFStringEncodingMacDevanagari ;
2454             break ;
2455         case wxFONTENCODING_MACGURMUKHI :
2456             enc = kCFStringEncodingMacGurmukhi ;
2457             break ;
2458         case wxFONTENCODING_MACGUJARATI :
2459             enc = kCFStringEncodingMacGujarati ;
2460             break ;
2461         case wxFONTENCODING_MACORIYA :
2462             enc = kCFStringEncodingMacOriya ;
2463             break ;
2464         case wxFONTENCODING_MACBENGALI :
2465             enc = kCFStringEncodingMacBengali ;
2466             break ;
2467         case wxFONTENCODING_MACTAMIL :
2468             enc = kCFStringEncodingMacTamil ;
2469             break ;
2470         case wxFONTENCODING_MACTELUGU :
2471             enc = kCFStringEncodingMacTelugu ;
2472             break ;
2473         case wxFONTENCODING_MACKANNADA :
2474             enc = kCFStringEncodingMacKannada ;
2475             break ;
2476         case wxFONTENCODING_MACMALAJALAM :
2477             enc = kCFStringEncodingMacMalayalam ;
2478             break ;
2479         case wxFONTENCODING_MACSINHALESE :
2480             enc = kCFStringEncodingMacSinhalese ;
2481             break ;
2482         case wxFONTENCODING_MACBURMESE :
2483             enc = kCFStringEncodingMacBurmese ;
2484             break ;
2485         case wxFONTENCODING_MACKHMER :
2486             enc = kCFStringEncodingMacKhmer ;
2487             break ;
2488         case wxFONTENCODING_MACTHAI :
2489             enc = kCFStringEncodingMacThai ;
2490             break ;
2491         case wxFONTENCODING_MACLAOTIAN :
2492             enc = kCFStringEncodingMacLaotian ;
2493             break ;
2494         case wxFONTENCODING_MACGEORGIAN :
2495             enc = kCFStringEncodingMacGeorgian ;
2496             break ;
2497         case wxFONTENCODING_MACARMENIAN :
2498             enc = kCFStringEncodingMacArmenian ;
2499             break ;
2500         case wxFONTENCODING_MACCHINESESIMP :
2501             enc = kCFStringEncodingMacChineseSimp ;
2502             break ;
2503         case wxFONTENCODING_MACTIBETAN :
2504             enc = kCFStringEncodingMacTibetan ;
2505             break ;
2506         case wxFONTENCODING_MACMONGOLIAN :
2507             enc = kCFStringEncodingMacMongolian ;
2508             break ;
2509         case wxFONTENCODING_MACETHIOPIC :
2510             enc = kCFStringEncodingMacEthiopic ;
2511             break ;
2512         case wxFONTENCODING_MACCENTRALEUR :
2513             enc = kCFStringEncodingMacCentralEurRoman ;
2514             break ;
2515         case wxFONTENCODING_MACVIATNAMESE :
2516             enc = kCFStringEncodingMacVietnamese ;
2517             break ;
2518         case wxFONTENCODING_MACARABICEXT :
2519             enc = kCFStringEncodingMacExtArabic ;
2520             break ;
2521         case wxFONTENCODING_MACSYMBOL :
2522             enc = kCFStringEncodingMacSymbol ;
2523             break ;
2524         case wxFONTENCODING_MACDINGBATS :
2525             enc = kCFStringEncodingMacDingbats ;
2526             break ;
2527         case wxFONTENCODING_MACTURKISH :
2528             enc = kCFStringEncodingMacTurkish ;
2529             break ;
2530         case wxFONTENCODING_MACCROATIAN :
2531             enc = kCFStringEncodingMacCroatian ;
2532             break ;
2533         case wxFONTENCODING_MACICELANDIC :
2534             enc = kCFStringEncodingMacIcelandic ;
2535             break ;
2536         case wxFONTENCODING_MACROMANIAN :
2537             enc = kCFStringEncodingMacRomanian ;
2538             break ;
2539         case wxFONTENCODING_MACCELTIC :
2540             enc = kCFStringEncodingMacCeltic ;
2541             break ;
2542         case wxFONTENCODING_MACGAELIC :
2543             enc = kCFStringEncodingMacGaelic ;
2544             break ;
2545 //      case wxFONTENCODING_MACKEYBOARD :
2546 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2547 //          break ;
2548
2549         default :
2550             // because gcc is picky
2551             break ;
2552     }
2553
2554     return enc ;
2555 }
2556
2557 class wxMBConv_cocoa : public wxMBConv
2558 {
2559 public:
2560     wxMBConv_cocoa()
2561     {
2562         Init(CFStringGetSystemEncoding()) ;
2563     }
2564
2565     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2566     {
2567         m_encoding = conv.m_encoding;
2568     }
2569
2570 #if wxUSE_FONTMAP
2571     wxMBConv_cocoa(const wxChar* name)
2572     {
2573         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2574     }
2575 #endif
2576
2577     wxMBConv_cocoa(wxFontEncoding encoding)
2578     {
2579         Init( wxCFStringEncFromFontEnc(encoding) );
2580     }
2581
2582     ~wxMBConv_cocoa()
2583     {
2584     }
2585
2586     void Init( CFStringEncoding encoding)
2587     {
2588         m_encoding = encoding ;
2589     }
2590
2591     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2592     {
2593         wxASSERT(szUnConv);
2594
2595         CFStringRef theString = CFStringCreateWithBytes (
2596                                                 NULL, //the allocator
2597                                                 (const UInt8*)szUnConv,
2598                                                 strlen(szUnConv),
2599                                                 m_encoding,
2600                                                 false //no BOM/external representation
2601                                                 );
2602
2603         wxASSERT(theString);
2604
2605         size_t nOutLength = CFStringGetLength(theString);
2606
2607         if (szOut == NULL)
2608         {
2609             CFRelease(theString);
2610             return nOutLength;
2611         }
2612
2613         CFRange theRange = { 0, nOutSize };
2614
2615 #if SIZEOF_WCHAR_T == 4
2616         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2617 #endif
2618
2619         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2620
2621         CFRelease(theString);
2622
2623         szUniCharBuffer[nOutLength] = '\0';
2624
2625 #if SIZEOF_WCHAR_T == 4
2626         wxMBConvUTF16 converter;
2627         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2628         delete [] szUniCharBuffer;
2629 #endif
2630
2631         return nOutLength;
2632     }
2633
2634     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2635     {
2636         wxASSERT(szUnConv);
2637
2638         size_t nRealOutSize;
2639         size_t nBufSize = wxWcslen(szUnConv);
2640         UniChar* szUniBuffer = (UniChar*) szUnConv;
2641
2642 #if SIZEOF_WCHAR_T == 4
2643         wxMBConvUTF16 converter ;
2644         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2645         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2646         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2647         nBufSize /= sizeof(UniChar);
2648 #endif
2649
2650         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2651                                 NULL, //allocator
2652                                 szUniBuffer,
2653                                 nBufSize,
2654                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2655                             );
2656
2657         wxASSERT(theString);
2658
2659         //Note that CER puts a BOM when converting to unicode
2660         //so we  check and use getchars instead in that case
2661         if (m_encoding == kCFStringEncodingUnicode)
2662         {
2663             if (szOut != NULL)
2664                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2665
2666             nRealOutSize = CFStringGetLength(theString) + 1;
2667         }
2668         else
2669         {
2670             CFStringGetBytes(
2671                 theString,
2672                 CFRangeMake(0, CFStringGetLength(theString)),
2673                 m_encoding,
2674                 0, //what to put in characters that can't be converted -
2675                     //0 tells CFString to return NULL if it meets such a character
2676                 false, //not an external representation
2677                 (UInt8*) szOut,
2678                 nOutSize,
2679                 (CFIndex*) &nRealOutSize
2680                         );
2681         }
2682
2683         CFRelease(theString);
2684
2685 #if SIZEOF_WCHAR_T == 4
2686         delete[] szUniBuffer;
2687 #endif
2688
2689         return  nRealOutSize - 1;
2690     }
2691
2692     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2693
2694     bool IsOk() const
2695     {
2696         return m_encoding != kCFStringEncodingInvalidId &&
2697               CFStringIsEncodingAvailable(m_encoding);
2698     }
2699
2700 private:
2701     CFStringEncoding m_encoding ;
2702 };
2703
2704 #endif // defined(__WXCOCOA__)
2705
2706 // ============================================================================
2707 // Mac conversion classes
2708 // ============================================================================
2709
2710 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2711
2712 class wxMBConv_mac : public wxMBConv
2713 {
2714 public:
2715     wxMBConv_mac()
2716     {
2717         Init(CFStringGetSystemEncoding()) ;
2718     }
2719
2720     wxMBConv_mac(const wxMBConv_mac& conv)
2721     {
2722         Init(conv.m_char_encoding);
2723     }
2724
2725 #if wxUSE_FONTMAP
2726     wxMBConv_mac(const wxChar* name)
2727     {
2728         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2729     }
2730 #endif
2731
2732     wxMBConv_mac(wxFontEncoding encoding)
2733     {
2734         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2735     }
2736
2737     ~wxMBConv_mac()
2738     {
2739         OSStatus status = noErr ;
2740         status = TECDisposeConverter(m_MB2WC_converter);
2741         status = TECDisposeConverter(m_WC2MB_converter);
2742     }
2743
2744
2745     void Init( TextEncodingBase encoding)
2746     {
2747         OSStatus status = noErr ;
2748         m_char_encoding = encoding ;
2749         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2750
2751         status = TECCreateConverter(&m_MB2WC_converter,
2752                                     m_char_encoding,
2753                                     m_unicode_encoding);
2754         status = TECCreateConverter(&m_WC2MB_converter,
2755                                     m_unicode_encoding,
2756                                     m_char_encoding);
2757     }
2758
    // Convert the NUL-terminated multibyte string psz to wide characters
    // using the converter stored in m_MB2WC_converter.
    //
    // buf is the destination and n its size in wide characters. If buf is
    // NULL a temporary buffer is allocated, used as destination and freed
    // again before returning, so only the returned count is of use.
    //
    // NOTE(review): the status returned by TECConvertText() is stored but
    // never examined and the malloc() results are not checked -- confirm
    // whether failures need to be reported to the caller.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        // the input length includes the terminating NUL of psz
        ByteCount byteInLen = strlen(psz) + 1;
        wchar_t *tbuf = NULL ;
        UniChar* ubuf = NULL ;
        size_t res = 0 ;

        if (buf == NULL)
        {
            // no output buffer given: choose a size ourselves and convert
            // into a scratch buffer instead
            // Apple specs say at least 32
            n = wxMax( 32, byteInLen ) ;
            tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
        }

        // size in bytes of the UniChar output handed to TECConvertText()
        ByteCount byteBufferLen = n * sizeof( UniChar ) ;

#if SIZEOF_WCHAR_T == 4
        // wchar_t is wider than UniChar: convert into an intermediate
        // UniChar buffer (2 extra bytes leave room for a terminator)
        ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
#else
        // otherwise the converter can write directly into the destination
        ubuf = (UniChar*) (buf ? buf : tbuf) ;
#endif

        status = TECConvertText(
            m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
            (TextPtr) ubuf, byteBufferLen, &byteOutLen);

#if SIZEOF_WCHAR_T == 4
        // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
        // is not properly terminated we get random characters at the end
        ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
        // second step: expand the intermediate UniChar data to wchar_t
        wxMBConvUTF16 converter ;
        res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
        free( ubuf ) ;
#else
        // number of UniChar units written by the converter
        res = byteOutLen / sizeof( UniChar ) ;
#endif

        // the scratch buffer was only needed to let the conversion run
        if ( buf == NULL )
             free(tbuf) ;

        // NUL-terminate the caller's buffer if there is room left for it
        if ( buf  && res < n)
            buf[res] = 0;

        return res ;
    }
2806
    // Convert the NUL-terminated wide string psz to the multibyte encoding
    // using the converter stored in m_WC2MB_converter.
    //
    // buf is the destination and n its size in bytes. If buf is NULL a
    // temporary buffer is allocated, used as destination and freed again
    // before returning. When a buffer was given and the result fits in it,
    // the output is converted back with MB2WC() and wxCONV_FAILED is
    // returned unless the round trip reproduces psz exactly.
    //
    // NOTE(review): the status returned by TECConvertText() is stored but
    // never examined and the malloc() results are not checked -- confirm
    // whether failures need to be reported to the caller.
    size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
    {
        OSStatus status = noErr ;
        ByteCount byteOutLen ;
        // input size in bytes, without the terminating NUL
        ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;

        char *tbuf = NULL ;

        if (buf == NULL)
        {
            // no output buffer given: reserve 8 bytes per input character
            // (plus SIZEOF_WCHAR_T) in a scratch buffer, but never less
            // than 32 bytes
            // Apple specs say at least 32
            n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
            tbuf = (char*) malloc( n ) ;
        }

        ByteCount byteBufferLen = n ;
        UniChar* ubuf = NULL ;

#if SIZEOF_WCHAR_T == 4
        // wchar_t is wider than UniChar: first produce an intermediate
        // UniChar copy of the input with wxMBConvUTF16 (the first call only
        // computes the size needed, the second one fills the buffer)
        wxMBConvUTF16 converter ;
        size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
        byteInLen = unicharlen ;
        ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
        converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
#else
        // otherwise the input can be handed to the converter as is
        ubuf = (UniChar*) psz ;
#endif

        status = TECConvertText(
            m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
            (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);

#if SIZEOF_WCHAR_T == 4
        free( ubuf ) ;
#endif

        // the scratch buffer was only needed to let the conversion run
        if ( buf == NULL )
            free(tbuf) ;

        size_t res = byteOutLen ;
        if ( buf  && res < n)
        {
            // there is room left, so NUL-terminate the output
            buf[res] = 0;

            //we need to double-trip to verify it didn't insert any ? in place
            //of bogus characters
            wxWCharBuffer wcBuf(n);
            size_t pszlen = wxWcslen(psz);
            if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
                        wxWcslen(wcBuf) != pszlen ||
                        memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        return res ;
    }
2867
2868     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2869
2870     bool IsOk() const
2871         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2872
2873 private:
2874     TECObjectRef m_MB2WC_converter;
2875     TECObjectRef m_WC2MB_converter;
2876
2877     TextEncodingBase m_char_encoding;
2878     TextEncodingBase m_unicode_encoding;
2879 };
2880
2881 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2882
2883 // ============================================================================
2884 // wxEncodingConverter based conversion classes
2885 // ============================================================================
2886
2887 #if wxUSE_FONTMAP
2888
2889 class wxMBConv_wxwin : public wxMBConv
2890 {
2891 private:
2892     void Init()
2893     {
2894         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2895                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2896     }
2897
2898 public:
2899     // temporarily just use wxEncodingConverter stuff,
2900     // so that it works while a better implementation is built
2901     wxMBConv_wxwin(const wxChar* name)
2902     {
2903         if (name)
2904             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2905         else
2906             m_enc = wxFONTENCODING_SYSTEM;
2907
2908         Init();
2909     }
2910
2911     wxMBConv_wxwin(wxFontEncoding enc)
2912     {
2913         m_enc = enc;
2914
2915         Init();
2916     }
2917
2918     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2919     {
2920         size_t inbuf = strlen(psz);
2921         if (buf)
2922         {
2923             if (!m2w.Convert(psz, buf))
2924                 return wxCONV_FAILED;
2925         }
2926         return inbuf;
2927     }
2928
2929     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2930     {
2931         const size_t inbuf = wxWcslen(psz);
2932         if (buf)
2933         {
2934             if (!w2m.Convert(psz, buf))
2935                 return wxCONV_FAILED;
2936         }
2937
2938         return inbuf;
2939     }
2940
2941     virtual size_t GetMBNulLen() const
2942     {
2943         switch ( m_enc )
2944         {
2945             case wxFONTENCODING_UTF16BE:
2946             case wxFONTENCODING_UTF16LE:
2947                 return 2;
2948
2949             case wxFONTENCODING_UTF32BE:
2950             case wxFONTENCODING_UTF32LE:
2951                 return 4;
2952
2953             default:
2954                 return 1;
2955         }
2956     }
2957
2958     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2959
2960     bool IsOk() const { return m_ok; }
2961
2962 public:
2963     wxFontEncoding m_enc;
2964     wxEncodingConverter m2w, w2m;
2965
2966 private:
2967     // were we initialized successfully?
2968     bool m_ok;
2969
2970     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2971 };
2972
2973 // make the constructors available for unit testing
2974 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2975 {
2976     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2977     if ( !result->IsOk() )
2978     {
2979         delete result;
2980         return 0;
2981     }
2982
2983     return result;
2984 }
2985
2986 #endif // wxUSE_FONTMAP
2987
2988 // ============================================================================
2989 // wxCSConv implementation
2990 // ============================================================================
2991
2992 void wxCSConv::Init()
2993 {
2994     m_name = NULL;
2995     m_convReal =  NULL;
2996     m_deferred = true;
2997 }
2998
2999 wxCSConv::wxCSConv(const wxChar *charset)
3000 {
3001     Init();
3002
3003     if ( charset )
3004     {
3005         SetName(charset);
3006     }
3007
3008 #if wxUSE_FONTMAP
3009     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3010 #else
3011     m_encoding = wxFONTENCODING_SYSTEM;
3012 #endif
3013 }
3014
3015 wxCSConv::wxCSConv(wxFontEncoding encoding)
3016 {
3017     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3018     {
3019         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3020
3021         encoding = wxFONTENCODING_SYSTEM;
3022     }
3023
3024     Init();
3025
3026     m_encoding = encoding;
3027 }
3028
// Release everything owned by this object: Clear() frees the charset name
// and deletes the real converter.
wxCSConv::~wxCSConv()
{
    Clear();
}
3033
3034 wxCSConv::wxCSConv(const wxCSConv& conv)
3035         : wxMBConv()
3036 {
3037     Init();
3038
3039     SetName(conv.m_name);
3040     m_encoding = conv.m_encoding;
3041 }
3042
3043 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3044 {
3045     Clear();
3046
3047     SetName(conv.m_name);
3048     m_encoding = conv.m_encoding;
3049
3050     return *this;
3051 }
3052
3053 void wxCSConv::Clear()
3054 {
3055     free(m_name);
3056     delete m_convReal;
3057
3058     m_name = NULL;
3059     m_convReal = NULL;
3060 }
3061
3062 void wxCSConv::SetName(const wxChar *charset)
3063 {
3064     if (charset)
3065     {
3066         m_name = wxStrdup(charset);
3067         m_deferred = true;
3068     }
3069 }
3070
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a wxString with a wxFontEncoding key
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache filled by wxCSConv::DoCreate(): for each encoding already looked up
// it stores the charset name for which a wxMBConv_iconv could be created or
// an empty string if none of the names worked
static wxEncodingNameCache gs_nameCache;
#endif
3079
3080 wxMBConv *wxCSConv::DoCreate() const
3081 {
3082 #if wxUSE_FONTMAP
3083     wxLogTrace(TRACE_STRCONV,
3084                wxT("creating conversion for %s"),
3085                (m_name ? m_name
3086                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3087 #endif // wxUSE_FONTMAP
3088
3089     // check for the special case of ASCII or ISO8859-1 charset: as we have
3090     // special knowledge of it anyhow, we don't need to create a special
3091     // conversion object
3092     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3093             m_encoding == wxFONTENCODING_DEFAULT )
3094     {
3095         // don't convert at all
3096         return NULL;
3097     }
3098
3099     // we trust OS to do conversion better than we can so try external
3100     // conversion methods first
3101     //
3102     // the full order is:
3103     //      1. OS conversion (iconv() under Unix or Win32 API)
3104     //      2. hard coded conversions for UTF
3105     //      3. wxEncodingConverter as fall back
3106
3107     // step (1)
3108 #ifdef HAVE_ICONV
3109 #if !wxUSE_FONTMAP
3110     if ( m_name )
3111 #endif // !wxUSE_FONTMAP
3112     {
3113         wxString name(m_name);
3114         wxFontEncoding encoding(m_encoding);
3115
3116         if ( !name.empty() )
3117         {
3118             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3119             if ( conv->IsOk() )
3120                 return conv;
3121
3122             delete conv;
3123
3124 #if wxUSE_FONTMAP
3125             encoding =
3126                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3127 #endif // wxUSE_FONTMAP
3128         }
3129 #if wxUSE_FONTMAP
3130         {
3131             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3132             if ( it != gs_nameCache.end() )
3133             {
3134                 if ( it->second.empty() )
3135                     return NULL;
3136
3137                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3138                 if ( conv->IsOk() )
3139                     return conv;
3140
3141                 delete conv;
3142             }
3143
3144             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3145
3146             for ( ; *names; ++names )
3147             {
3148                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3149                 if ( conv->IsOk() )
3150                 {
3151                     gs_nameCache[encoding] = *names;
3152                     return conv;
3153                 }
3154
3155                 delete conv;
3156             }
3157
3158             gs_nameCache[encoding] = _T(""); // cache the failure
3159         }
3160 #endif // wxUSE_FONTMAP
3161     }
3162 #endif // HAVE_ICONV
3163
3164 #ifdef wxHAVE_WIN32_MB2WC
3165     {
3166 #if wxUSE_FONTMAP
3167         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3168                                       : new wxMBConv_win32(m_encoding);
3169         if ( conv->IsOk() )
3170             return conv;
3171
3172         delete conv;
3173 #else
3174         return NULL;
3175 #endif
3176     }
3177 #endif // wxHAVE_WIN32_MB2WC
3178
3179 #if defined(__WXMAC__)
3180     {
3181         // leave UTF16 and UTF32 to the built-ins of wx
3182         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3183             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3184         {
3185 #if wxUSE_FONTMAP
3186             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3187                                         : new wxMBConv_mac(m_encoding);
3188 #else
3189             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3190 #endif
3191             if ( conv->IsOk() )
3192                  return conv;
3193
3194             delete conv;
3195         }
3196     }
3197 #endif
3198
3199 #if defined(__WXCOCOA__)
3200     {
3201         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3202         {
3203 #if wxUSE_FONTMAP
3204             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3205                                           : new wxMBConv_cocoa(m_encoding);
3206 #else
3207             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3208 #endif
3209
3210             if ( conv->IsOk() )
3211                  return conv;
3212
3213             delete conv;
3214         }
3215     }
3216 #endif
3217     // step (2)
3218     wxFontEncoding enc = m_encoding;
3219 #if wxUSE_FONTMAP
3220     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3221     {
3222         // use "false" to suppress interactive dialogs -- we can be called from
3223         // anywhere and popping up a dialog from here is the last thing we want to
3224         // do
3225         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3226     }
3227 #endif // wxUSE_FONTMAP
3228
3229     switch ( enc )
3230     {
3231         case wxFONTENCODING_UTF7:
3232              return new wxMBConvUTF7;
3233
3234         case wxFONTENCODING_UTF8:
3235              return new wxMBConvUTF8;
3236
3237         case wxFONTENCODING_UTF16BE:
3238              return new wxMBConvUTF16BE;
3239
3240         case wxFONTENCODING_UTF16LE:
3241              return new wxMBConvUTF16LE;
3242
3243         case wxFONTENCODING_UTF32BE:
3244              return new wxMBConvUTF32BE;
3245
3246         case wxFONTENCODING_UTF32LE:
3247              return new wxMBConvUTF32LE;
3248
3249         default:
3250              // nothing to do but put here to suppress gcc warnings
3251              break;
3252     }
3253
3254     // step (3)
3255 #if wxUSE_FONTMAP
3256     {
3257         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3258                                       : new wxMBConv_wxwin(m_encoding);
3259         if ( conv->IsOk() )
3260             return conv;
3261
3262         delete conv;
3263     }
3264 #endif // wxUSE_FONTMAP
3265
3266     // NB: This is a hack to prevent deadlock. What could otherwise happen
3267     //     in Unicode build: wxConvLocal creation ends up being here
3268     //     because of some failure and logs the error. But wxLog will try to
3269     //     attach timestamp, for which it will need wxConvLocal (to convert
3270     //     time to char* and then wchar_t*), but that fails, tries to log
3271     //     error, but wxLog has a (already locked) critical section that
3272     //     guards static buffer.
3273     static bool alreadyLoggingError = false;
3274     if (!alreadyLoggingError)
3275     {
3276         alreadyLoggingError = true;
3277         wxLogError(_("Cannot convert from the charset '%s'!"),
3278                    m_name ? m_name
3279                       :
3280 #if wxUSE_FONTMAP
3281                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3282 #else // !wxUSE_FONTMAP
3283                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3284 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3285               );
3286
3287         alreadyLoggingError = false;
3288     }
3289
3290     return NULL;
3291 }
3292
3293 void wxCSConv::CreateConvIfNeeded() const
3294 {
3295     if ( m_deferred )
3296     {
3297         wxCSConv *self = (wxCSConv *)this; // const_cast
3298
3299 #if wxUSE_INTL
3300         // if we don't have neither the name nor the encoding, use the default
3301         // encoding for this system
3302         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3303         {
3304             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3305         }
3306 #endif // wxUSE_INTL
3307
3308         self->m_convReal = DoCreate();
3309         self->m_deferred = false;
3310     }
3311 }
3312
3313 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3314 {
3315     CreateConvIfNeeded();
3316
3317     if (m_convReal)
3318         return m_convReal->MB2WC(buf, psz, n);
3319
3320     // latin-1 (direct)
3321     size_t len = strlen(psz);
3322
3323     if (buf)
3324     {
3325         for (size_t c = 0; c <= len; c++)
3326             buf[c] = (unsigned char)(psz[c]);
3327     }
3328
3329     return len;
3330 }
3331
3332 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3333 {
3334     CreateConvIfNeeded();
3335
3336     if (m_convReal)
3337         return m_convReal->WC2MB(buf, psz, n);
3338
3339     // latin-1 (direct)
3340     const size_t len = wxWcslen(psz);
3341     if (buf)
3342     {
3343         for (size_t c = 0; c <= len; c++)
3344         {
3345             if (psz[c] > 0xFF)
3346                 return wxCONV_FAILED;
3347
3348             buf[c] = (char)psz[c];
3349         }
3350     }
3351     else
3352     {
3353         for (size_t c = 0; c <= len; c++)
3354         {
3355             if (psz[c] > 0xFF)
3356                 return wxCONV_FAILED;
3357         }
3358     }
3359
3360     return len;
3361 }
3362
3363 size_t wxCSConv::GetMBNulLen() const
3364 {
3365     CreateConvIfNeeded();
3366
3367     if ( m_convReal )
3368     {
3369         return m_convReal->GetMBNulLen();
3370     }
3371
3372     return 1;
3373 }
3374
3375 // ----------------------------------------------------------------------------
3376 // globals
3377 // ----------------------------------------------------------------------------
3378
// the object behind wxConvLibc: the class used for it depends on the
// platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported converters are references bound to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// the exported pointers: wxConvCurrent initially points to the libc
// converter and wxConvUI to the local one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// file names are converted using UTF-8 if __WXOSX__ is defined and using the
// libc converter otherwise
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3405
3406 #else // !wxUSE_WCHAR_T
3407
// stand-ins in absence of wchar_t: when wxUSE_WCHAR_T is off the predefined
// converters are defined as plain wxMBConv objects
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3413
3414 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T