src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21     #include "wx/utils.h"
  22 #endif
  23
  24 #include "wx/strconv.h"
  25
  26 #if wxUSE_WCHAR_T
  27
  28 #ifdef __WINDOWS__
  29     #include "wx/msw/private.h"
  30     #include "wx/msw/missing.h"
  31 #endif
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #define wxHAVE_WIN32_MB2WC
  43 #endif
  44
  45 #ifdef __SALFORDC__
  46     #include <clib.h>
  47 #endif
  48
  49 #ifdef HAVE_ICONV
  50     #include <iconv.h>
  51     #include "wx/thread.h"
  52 #endif
  53
  54 #include "wx/encconv.h"
  55 #include "wx/fontmap.h"
  56
  57 #ifdef __WXMAC__
  58 #ifndef __DARWIN__
  59 #include <ATSUnicode.h>
  60 #include <TextCommon.h>
  61 #include <TextEncodingConverter.h>
  62 #endif
  63
  64 // includes Mac headers
  65 #include "wx/mac/private.h"
  66 #endif
  67
  68
  69 #define TRACE_STRCONV _T("strconv")
  70
  71 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  72 // be 4 bytes
  73 #if SIZEOF_WCHAR_T == 2
  74     #define WC_UTF16
  75 #endif
  76
  77
  78 // ============================================================================
  79 // implementation
  80 // ============================================================================
  81
  82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  83 static bool NotAllNULs(const char *p, size_t n)
  84 {
  85     while ( n && *p++ == '\0' )
  86         n--;
  87
  88     return n != 0;
  89 }
  90
  91 // ----------------------------------------------------------------------------
  92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  93 // ----------------------------------------------------------------------------
  94
  95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  96 {
  97     if (input <= 0xffff)
  98     {
  99         if (output)
 100             *output = (wxUint16) input;
 101
 102         return 1;
 103     }
 104     else if (input >= 0x110000)
 105     {
 106         return wxCONV_FAILED;
 107     }
 108     else
 109     {
 110         if (output)
 111         {
 112             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 113             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 114         }
 115
 116         return 2;
 117     }
 118 }
 119
 120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 121 {
 122     if ((*input < 0xd800) || (*input > 0xdfff))
 123     {
 124         output = *input;
 125         return 1;
 126     }
 127     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 128     {
 129         output = *input;
 130         return wxCONV_FAILED;
 131     }
 132     else
 133     {
 134         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 135         return 2;
 136     }
 137 }
 138
 139 #ifdef WC_UTF16
 140     typedef wchar_t wxDecodeSurrogate_t;
 141 #else // !WC_UTF16
 142     typedef wxUint16 wxDecodeSurrogate_t;
 143 #endif // WC_UTF16/!WC_UTF16
 144
 145 // returns the next UTF-32 character from the wchar_t buffer and advances the
 146 // pointer to the character after this one
 147 //
 148 // if an invalid character is found, *pSrc is set to NULL, the caller must
 149 // check for this
 150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 151 {
 152     wxUint32 out;
 153     const size_t
 154         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 155     if ( n == wxCONV_FAILED )
 156         *pSrc = NULL;
 157     else
 158         *pSrc += n;
 159
 160     return out;
 161 }
 162
 163 // ----------------------------------------------------------------------------
 164 // wxMBConv
 165 // ----------------------------------------------------------------------------
 166
// Default implementation of the multibyte -> wide conversion in terms of the
// old MB2WC() function.
//
// dst:    output buffer or NULL to only compute the required size
// dstLen: size of dst in wide characters (checked only if dst != NULL)
// src:    input string, may contain embedded NULs if srcLen is given
// srcLen: length of src in bytes or wxNO_LEN if it is NUL-terminated
//
// Returns the number of wide characters written (or which would be written
// if dst were not NULL), counting one L'\0' for each converted chunk, or
// wxCONV_FAILED if the conversion failed or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy (the first test also
        // protects the pointer arithmetic in the second one from underflow)
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // convert the input chunk by chunk, a chunk being a NUL-terminated part
    // of the input
    for ( ;; )
    {
        // try to convert the current chunk: first only compute its length
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the terminator of this empty chunk is counted in
            // dstWritten but nothing is stored in dst for it
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // now really convert this chunk
            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 273
 274 size_t
 275 wxMBConv::FromWChar(char *dst, size_t dstLen,
 276                     const wchar_t *src, size_t srcLen) const
 277 {
 278     // the number of chars [which would be] written to dst [if it were not NULL]
 279     size_t dstWritten = 0;
 280
 281     // make a copy of the input string unless it is already properly
 282     // NUL-terminated
 283     //
 284     // if we don't know its length we have no choice but to assume that it is,
 285     // indeed, properly terminated
 286     wxWCharBuffer bufTmp;
 287     if ( srcLen == wxNO_LEN )
 288     {
 289         srcLen = wxWcslen(src) + 1;
 290     }
 291     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 292     {
 293         // make a copy in order to properly NUL-terminate the string
 294         bufTmp = wxWCharBuffer(srcLen);
 295         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 296         src = bufTmp;
 297     }
 298
 299     const size_t lenNul = GetMBNulLen();
 300     for ( const wchar_t * const srcEnd = src + srcLen;
 301           src < srcEnd;
 302           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 303     {
 304         // try to convert the current chunk
 305         size_t lenChunk = WC2MB(NULL, src, 0);
 306
 307         if ( lenChunk == wxCONV_FAILED )
 308             return wxCONV_FAILED;
 309
 310         lenChunk += lenNul;
 311         dstWritten += lenChunk;
 312
 313         if ( dst )
 314         {
 315             if ( dstWritten > dstLen )
 316                 return wxCONV_FAILED;
 317
 318             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 319                 return wxCONV_FAILED;
 320
 321             dst += lenChunk;
 322         }
 323     }
 324
 325     return dstWritten;
 326 }
 327
 328 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 329 {
 330     size_t rc = ToWChar(outBuff, outLen, inBuff);
 331     if ( rc != wxCONV_FAILED )
 332     {
 333         // ToWChar() returns the buffer length, i.e. including the trailing
 334         // NUL, while this method doesn't take it into account
 335         rc--;
 336     }
 337
 338     return rc;
 339 }
 340
 341 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 342 {
 343     size_t rc = FromWChar(outBuff, outLen, inBuff);
 344     if ( rc != wxCONV_FAILED )
 345     {
 346         rc -= GetMBNulLen();
 347     }
 348
 349     return rc;
 350 }
 351
// Out-of-line destructor: there is nothing to clean up in this class itself.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 356
 357 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 358 {
 359     if ( psz )
 360     {
 361         // calculate the length of the buffer needed first
 362         const size_t nLen = MB2WC(NULL, psz, 0);
 363         if ( nLen != wxCONV_FAILED )
 364         {
 365             // now do the actual conversion
 366             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 367
 368             // +1 for the trailing NULL
 369             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 370                 return buf;
 371         }
 372     }
 373
 374     return wxWCharBuffer();
 375 }
 376
 377 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 378 {
 379     if ( pwz )
 380     {
 381         const size_t nLen = WC2MB(NULL, pwz, 0);
 382         if ( nLen != wxCONV_FAILED )
 383         {
 384             // extra space for trailing NUL(s)
 385             static const size_t extraLen = GetMaxMBNulLen();
 386
 387             wxCharBuffer buf(nLen + extraLen - 1);
 388             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 389                 return buf;
 390         }
 391     }
 392
 393     return wxCharBuffer();
 394 }
 395
 396 const wxWCharBuffer
 397 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 398 {
 399     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 400     if ( dstLen != wxCONV_FAILED )
 401     {
 402         wxWCharBuffer wbuf(dstLen - 1);
 403         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 404         {
 405             if ( outLen )
 406             {
 407                 *outLen = dstLen;
 408                 if ( wbuf[dstLen - 1] == L'\0' )
 409                     (*outLen)--;
 410             }
 411
 412             return wbuf;
 413         }
 414     }
 415
 416     if ( outLen )
 417         *outLen = 0;
 418
 419     return wxWCharBuffer();
 420 }
 421
 422 const wxCharBuffer
 423 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 424 {
 425     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 426     if ( dstLen != wxCONV_FAILED )
 427     {
 428         // special case of empty input: can't allocate 0 size buffer below as
 429         // wxCharBuffer insists on NUL-terminating it
 430         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 431         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 432         {
 433             if ( outLen )
 434             {
 435                 *outLen = dstLen;
 436
 437                 const size_t nulLen = GetMBNulLen();
 438                 if ( dstLen >= nulLen &&
 439                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 440                 {
 441                     // in this case the output is NUL-terminated and we're not
 442                     // supposed to count NUL
 443                     *outLen -= nulLen;
 444                 }
 445             }
 446
 447             return buf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxCharBuffer();
 455 }
 456
 457 // ----------------------------------------------------------------------------
 458 // wxMBConvLibc
 459 // ----------------------------------------------------------------------------
 460
 461 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 462 {
 463     return wxMB2WC(buf, psz, n);
 464 }
 465
 466 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 467 {
 468     return wxWC2MB(buf, psz, n);
 469 }
 470
 471 // ----------------------------------------------------------------------------
 472 // wxConvBrokenFileNames
 473 // ----------------------------------------------------------------------------
 474
 475 #ifdef __UNIX__
 476
 477 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 478 {
 479     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 480                   || wxStricmp(charset, _T("UTF8")) == 0  )
 481         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 482     else
 483         m_conv = new wxCSConv(charset);
 484 }
 485
 486 #endif // __UNIX__
 487
 488 // ----------------------------------------------------------------------------
 489 // UTF-7
 490 // ----------------------------------------------------------------------------
 491
 492 // Implementation (C) 2004 Fredrik Roubert
 493
//
// BASE64 decoding table
//
// Indexed by the byte value: gives the 6 bit value (0x00..0x3f) of the BASE64
// digits 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/' and 0xff for all the other
// bytes, which is used by the decoder to detect the end of a BASE64 run.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 532
// Decodes the NUL-terminated UTF-7 string psz.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to decode
// n:   size of buf in wide characters, only used if buf is not NULL
//
// Returns the number of wide characters produced, not counting the trailing
// NUL (which is only written if there is room for it), or wxCONV_FAILED if a
// '+' starting a BASE64 sequence is not followed by any valid data.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign: "+-" stands for '+' itself
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits and l is the number of bits in
            // it not consumed yet; lsb tells whether the next byte extracted
            // is the low (true) or the high (false) byte of a 16 bit unit;
            // ok becomes true as soon as at least one byte was decoded
            //
            // NOTE(review): len is not checked against n inside this loop, so
            // a long BASE64 run looks like it can write past the end of buf;
            // confirm against the callers
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // append the 6 bits of this BASE64 digit
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: this completes the current character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: start a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // the '-' terminating the BASE64 sequence is optional and is not
            // part of the output
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 600
//
// BASE64 encoding table
//
// Indexed by the 6 bit value (0..63): gives the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 615
//
// UTF-7 encoding table
//
// Indexed by the ASCII character code (0..127), the value is the class of
// the character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder in this file only writes the characters of class 0 directly
// and uses BASE64 for all the others.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 635
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to encode
// n:   size of buf in bytes, only used if buf is not NULL
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only written if there is room for it). If wchar_t is not UTF-16,
// wxCONV_FAILED is returned when a character above 0xffff starts a new
// sequence as surrogate pairs are not generated.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // anything else is written as "+<BASE64 data>-", except for '+'
            // itself which becomes just "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string: d accumulates the bits to output and
                // l is the number of bits in it not written yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // add the high and then the low byte of this character
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // and output all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // continue the same BASE64 sequence until the end of the
                    // string or the next directly encoded character (psz is
                    // only advanced past cc if we do continue)
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // flush the remaining bits, if any, padding them with zeroes
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 709
 710 // ----------------------------------------------------------------------------
 711 // UTF-8
 712 // ----------------------------------------------------------------------------
 713
// utf8_max[k] is the greatest value which can be encoded using k + 1 bytes:
// the encoder uses it to find the length of the sequence to generate and the
// decoder to reject sequences longer than necessary for the value they encode
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 721
// Decodes the NUL-terminated UTF-8 string psz.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to decode
// n:   size of buf in wide characters, only used if buf is not NULL
//
// The handling of invalid input depends on m_options: with
// MAP_INVALID_UTF8_TO_PUA each byte of an invalid sequence is mapped to the
// character wxUnicodePUA + byte, with MAP_INVALID_UTF8_TO_OCTAL it is
// replaced with a backslash followed by 3 octal digits (and the backslashes
// present in the input are doubled) and otherwise the conversion fails.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL (which is only written if there is room for it), or wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to output its bytes
        // one by one if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of the continuation bytes
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence: a continuation byte can't start it
                invalid = true;
            }
            else
            {
                // ocnt is the index in utf8_max of the greatest value which
                // could have been encoded using one byte less
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding: either a continuation byte is
                    // missing or the value could be encoded in fewer bytes
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map each byte of the invalid sequence to a character in
                    // the private use area
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // replace each byte of the invalid sequence with its
                    // octal escape
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if there is enough space
                        // for all of its 4 characters but it is always counted
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 865
 866 static inline bool isoctal(wchar_t wch)
 867 {
 868     return L'0' <= wch && wch <= L'7';
 869 }
 870
// Encodes the NUL-terminated wide string psz as UTF-8.
//
// buf: output buffer or NULL to only compute the length of the result
// psz: the string to encode
// n:   size of buf in bytes, only used if buf is not NULL
//
// Depending on m_options, the mappings used for the invalid bytes when
// decoding are undone here: with MAP_INVALID_UTF8_TO_PUA the characters in
// the range [wxUnicodePUA, wxUnicodePUAEnd) are converted back to single
// bytes and with MAP_INVALID_UTF8_TO_OCTAL a double backslash becomes a
// single one and a backslash followed by 3 octal digits becomes the byte with
// this value.
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only written if there is room for it).
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // if decoding failed cc is the offending unit itself, just skip it
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output one backslash for two in the input
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte with the given value
            //
            // NOTE(review): values above 0377 are not rejected and are
            // truncated by the cast to char
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the first byte starts with cnt + 1 bits set to 1
                    // followed by a 0 bit and the most significant bits of
                    // the value
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each of the continuation bytes contains the next 6 bits
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 948
 949 // ============================================================================
 950 // UTF-16
 951 // ============================================================================
 952
 953 #ifdef WORDS_BIGENDIAN
 954     #define wxMBConvUTF16straight wxMBConvUTF16BE
 955     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 956 #else
 957     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 958     #define wxMBConvUTF16straight wxMBConvUTF16LE
 959 #endif
 960
 961 /* static */
 962 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 963 {
 964     if ( srcLen == wxNO_LEN )
 965     {
 966         // count the number of bytes in input, including the trailing NULs
 967         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 968         for ( srcLen = 1; *inBuff++; srcLen++ )
 969             ;
 970
 971         srcLen *= BYTES_PER_CHAR;
 972     }
 973     else // we already have the length
 974     {
 975         // we can only convert an entire number of UTF-16 characters
 976         if ( srcLen % BYTES_PER_CHAR )
 977             return wxCONV_FAILED;
 978     }
 979
 980     return srcLen;
 981 }
 982
 983 // case when in-memory representation is UTF-16 too
 984 #ifdef WC_UTF16
 985
 986 // ----------------------------------------------------------------------------
 987 // conversions without endianness change
 988 // ----------------------------------------------------------------------------
 989
 990 size_t
 991 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 992                                const char *src, size_t srcLen) const
 993 {
 994     // set up the scene for using memcpy() (which is presumably more efficient
 995     // than copying the bytes one by one)
 996     srcLen = GetLength(src, srcLen);
 997     if ( srcLen == wxNO_LEN )
 998         return wxCONV_FAILED;
 999
1000     const size_t inLen = srcLen / BYTES_PER_CHAR;
1001     if ( dst )
1002     {
1003         if ( dstLen < inLen )
1004             return wxCONV_FAILED;
1005
1006         memcpy(dst, src, srcLen);
1007     }
1008
1009     return inLen;
1010 }
1011
1012 size_t
1013 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1014                                  const wchar_t *src, size_t srcLen) const
1015 {
1016     if ( srcLen == wxNO_LEN )
1017         srcLen = wxWcslen(src) + 1;
1018
1019     srcLen *= BYTES_PER_CHAR;
1020
1021     if ( dst )
1022     {
1023         if ( dstLen < srcLen )
1024             return wxCONV_FAILED;
1025
1026         memcpy(dst, src, srcLen);
1027     }
1028
1029     return srcLen;
1030 }
1031
1032 // ----------------------------------------------------------------------------
1033 // endian-reversing conversions
1034 // ----------------------------------------------------------------------------
1035
1036 size_t
1037 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1038                            const char *src, size_t srcLen) const
1039 {
1040     srcLen = GetLength(src, srcLen);
1041     if ( srcLen == wxNO_LEN )
1042         return wxCONV_FAILED;
1043
1044     srcLen /= BYTES_PER_CHAR;
1045
1046     if ( dst )
1047     {
1048         if ( dstLen < srcLen )
1049             return wxCONV_FAILED;
1050
1051         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1052         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1053         {
1054             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1055         }
1056     }
1057
1058     return srcLen;
1059 }
1060
1061 size_t
1062 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1063                              const wchar_t *src, size_t srcLen) const
1064 {
1065     if ( srcLen == wxNO_LEN )
1066         srcLen = wxWcslen(src) + 1;
1067
1068     srcLen *= BYTES_PER_CHAR;
1069
1070     if ( dst )
1071     {
1072         if ( dstLen < srcLen )
1073             return wxCONV_FAILED;
1074
1075         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1076         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1077         {
1078             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1079         }
1080     }
1081
1082     return srcLen;
1083 }
1084
1085 #else // !WC_UTF16: wchar_t is UTF-32
1086
1087 // ----------------------------------------------------------------------------
1088 // conversions without endianness change
1089 // ----------------------------------------------------------------------------
1090
1091 size_t
1092 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1093                                const char *src, size_t srcLen) const
1094 {
1095     srcLen = GetLength(src, srcLen);
1096     if ( srcLen == wxNO_LEN )
1097         return wxCONV_FAILED;
1098
1099     const size_t inLen = srcLen / BYTES_PER_CHAR;
1100     if ( !dst )
1101     {
1102         // optimization: return maximal space which could be needed for this
1103         // string even if the real size could be smaller if the buffer contains
1104         // any surrogates
1105         return inLen;
1106     }
1107
1108     size_t outLen = 0;
1109     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1110     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1111     {
1112         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1113         if ( !inBuff )
1114             return wxCONV_FAILED;
1115
1116         if ( ++outLen > dstLen )
1117             return wxCONV_FAILED;
1118
1119         *dst++ = ch;
1120     }
1121
1122
1123     return outLen;
1124 }
1125
1126 size_t
1127 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1128                                  const wchar_t *src, size_t srcLen) const
1129 {
1130     if ( srcLen == wxNO_LEN )
1131         srcLen = wxWcslen(src) + 1;
1132
1133     size_t outLen = 0;
1134     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1135     for ( size_t n = 0; n < srcLen; n++ )
1136     {
1137         wxUint16 cc[2];
1138         const size_t numChars = encode_utf16(*src++, cc);
1139         if ( numChars == wxCONV_FAILED )
1140             return wxCONV_FAILED;
1141
1142         outLen += numChars * BYTES_PER_CHAR;
1143         if ( outBuff )
1144         {
1145             if ( outLen > dstLen )
1146                 return wxCONV_FAILED;
1147
1148             *outBuff++ = cc[0];
1149             if ( numChars == 2 )
1150             {
1151                 // second character of a surrogate
1152                 *outBuff++ = cc[1];
1153             }
1154         }
1155     }
1156
1157     return outLen;
1158 }
1159
1160 // ----------------------------------------------------------------------------
1161 // endian-reversing conversions
1162 // ----------------------------------------------------------------------------
1163
1164 size_t
1165 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1166                            const char *src, size_t srcLen) const
1167 {
1168     srcLen = GetLength(src, srcLen);
1169     if ( srcLen == wxNO_LEN )
1170         return wxCONV_FAILED;
1171
1172     const size_t inLen = srcLen / BYTES_PER_CHAR;
1173     if ( !dst )
1174     {
1175         // optimization: return maximal space which could be needed for this
1176         // string even if the real size could be smaller if the buffer contains
1177         // any surrogates
1178         return inLen;
1179     }
1180
1181     size_t outLen = 0;
1182     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1183     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1184     {
1185         wxUint32 ch;
1186         wxUint16 tmp[2];
1187
1188         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1189         inBuff++;
1190         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1191
1192         const size_t numChars = decode_utf16(tmp, ch);
1193         if ( numChars == wxCONV_FAILED )
1194             return wxCONV_FAILED;
1195
1196         if ( numChars == 2 )
1197             inBuff++;
1198
1199         if ( ++outLen > dstLen )
1200             return wxCONV_FAILED;
1201
1202         *dst++ = ch;
1203     }
1204
1205
1206     return outLen;
1207 }
1208
1209 size_t
1210 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1211                              const wchar_t *src, size_t srcLen) const
1212 {
1213     if ( srcLen == wxNO_LEN )
1214         srcLen = wxWcslen(src) + 1;
1215
1216     size_t outLen = 0;
1217     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1218     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1219     {
1220         wxUint16 cc[2];
1221         const size_t numChars = encode_utf16(*src, cc);
1222         if ( numChars == wxCONV_FAILED )
1223             return wxCONV_FAILED;
1224
1225         outLen += numChars * BYTES_PER_CHAR;
1226         if ( outBuff )
1227         {
1228             if ( outLen > dstLen )
1229                 return wxCONV_FAILED;
1230
1231             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1232             if ( numChars == 2 )
1233             {
1234                 // second character of a surrogate
1235                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1236             }
1237         }
1238     }
1239
1240     return outLen;
1241 }
1242
1243 #endif // WC_UTF16/!WC_UTF16
1244
1245
1246 // ============================================================================
1247 // UTF-32
1248 // ============================================================================
1249
// "straight" is the converter whose byte order is the same as the one of the
// machine we run on and "swap" is the converter for the opposite byte order
#if !defined(WORDS_BIGENDIAN)
    // little endian host
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#else // big endian host
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#endif
1257
1258
// definitions of the global converter objects, one for each of the two
// byte orders of UTF-32
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1261
1262 /* static */
1263 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1264 {
1265     if ( srcLen == wxNO_LEN )
1266     {
1267         // count the number of bytes in input, including the trailing NULs
1268         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1269         for ( srcLen = 1; *inBuff++; srcLen++ )
1270             ;
1271
1272         srcLen *= BYTES_PER_CHAR;
1273     }
1274     else // we already have the length
1275     {
1276         // we can only convert an entire number of UTF-32 characters
1277         if ( srcLen % BYTES_PER_CHAR )
1278             return wxCONV_FAILED;
1279     }
1280
1281     return srcLen;
1282 }
1283
1284 // case when in-memory representation is UTF-16
1285 #ifdef WC_UTF16
1286
1287 // ----------------------------------------------------------------------------
1288 // conversions without endianness change
1289 // ----------------------------------------------------------------------------
1290
1291 size_t
1292 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1293                                const char *src, size_t srcLen) const
1294 {
1295     srcLen = GetLength(src, srcLen);
1296     if ( srcLen == wxNO_LEN )
1297         return wxCONV_FAILED;
1298
1299     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1300     const size_t inLen = srcLen / BYTES_PER_CHAR;
1301     size_t outLen = 0;
1302     for ( size_t n = 0; n < inLen; n++ )
1303     {
1304         wxUint16 cc[2];
1305         const size_t numChars = encode_utf16(*inBuff++, cc);
1306         if ( numChars == wxCONV_FAILED )
1307             return wxCONV_FAILED;
1308
1309         outLen += numChars;
1310         if ( dst )
1311         {
1312             if ( outLen > dstLen )
1313                 return wxCONV_FAILED;
1314
1315             *dst++ = cc[0];
1316             if ( numChars == 2 )
1317             {
1318                 // second character of a surrogate
1319                 *dst++ = cc[1];
1320             }
1321         }
1322     }
1323
1324     return outLen;
1325 }
1326
1327 size_t
1328 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1329                                  const wchar_t *src, size_t srcLen) const
1330 {
1331     if ( srcLen == wxNO_LEN )
1332         srcLen = wxWcslen(src) + 1;
1333
1334     if ( !dst )
1335     {
1336         // optimization: return maximal space which could be needed for this
1337         // string instead of the exact amount which could be less if there are
1338         // any surrogates in the input
1339         //
1340         // we consider that surrogates are rare enough to make it worthwhile to
1341         // avoid running the loop below at the cost of slightly extra memory
1342         // consumption
1343         return srcLen * BYTES_PER_CHAR;
1344     }
1345
1346     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1347     size_t outLen = 0;
1348     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1349     {
1350         const wxUint32 ch = wxDecodeSurrogate(&src);
1351         if ( !src )
1352             return wxCONV_FAILED;
1353
1354         outLen += BYTES_PER_CHAR;
1355
1356         if ( outLen > dstLen )
1357             return wxCONV_FAILED;
1358
1359         *outBuff++ = ch;
1360     }
1361
1362     return outLen;
1363 }
1364
1365 // ----------------------------------------------------------------------------
1366 // endian-reversing conversions
1367 // ----------------------------------------------------------------------------
1368
1369 size_t
1370 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1371                            const char *src, size_t srcLen) const
1372 {
1373     srcLen = GetLength(src, srcLen);
1374     if ( srcLen == wxNO_LEN )
1375         return wxCONV_FAILED;
1376
1377     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1378     const size_t inLen = srcLen / BYTES_PER_CHAR;
1379     size_t outLen = 0;
1380     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1381     {
1382         wxUint16 cc[2];
1383         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1384         if ( numChars == wxCONV_FAILED )
1385             return wxCONV_FAILED;
1386
1387         outLen += numChars;
1388         if ( dst )
1389         {
1390             if ( outLen > dstLen )
1391                 return wxCONV_FAILED;
1392
1393             *dst++ = cc[0];
1394             if ( numChars == 2 )
1395             {
1396                 // second character of a surrogate
1397                 *dst++ = cc[1];
1398             }
1399         }
1400     }
1401
1402     return outLen;
1403 }
1404
1405 size_t
1406 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1407                              const wchar_t *src, size_t srcLen) const
1408 {
1409     if ( srcLen == wxNO_LEN )
1410         srcLen = wxWcslen(src) + 1;
1411
1412     if ( !dst )
1413     {
1414         // optimization: return maximal space which could be needed for this
1415         // string instead of the exact amount which could be less if there are
1416         // any surrogates in the input
1417         //
1418         // we consider that surrogates are rare enough to make it worthwhile to
1419         // avoid running the loop below at the cost of slightly extra memory
1420         // consumption
1421         return srcLen*BYTES_PER_CHAR;
1422     }
1423
1424     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1425     size_t outLen = 0;
1426     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1427     {
1428         const wxUint32 ch = wxDecodeSurrogate(&src);
1429         if ( !src )
1430             return wxCONV_FAILED;
1431
1432         outLen += BYTES_PER_CHAR;
1433
1434         if ( outLen > dstLen )
1435             return wxCONV_FAILED;
1436
1437         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1438     }
1439
1440     return outLen;
1441 }
1442
1443 #else // !WC_UTF16: wchar_t is UTF-32
1444
1445 // ----------------------------------------------------------------------------
1446 // conversions without endianness change
1447 // ----------------------------------------------------------------------------
1448
1449 size_t
1450 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1451                                const char *src, size_t srcLen) const
1452 {
1453     // use memcpy() as it should be much faster than hand-written loop
1454     srcLen = GetLength(src, srcLen);
1455     if ( srcLen == wxNO_LEN )
1456         return wxCONV_FAILED;
1457
1458     const size_t inLen = srcLen/BYTES_PER_CHAR;
1459     if ( dst )
1460     {
1461         if ( dstLen < inLen )
1462             return wxCONV_FAILED;
1463
1464         memcpy(dst, src, srcLen);
1465     }
1466
1467     return inLen;
1468 }
1469
1470 size_t
1471 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1472                                  const wchar_t *src, size_t srcLen) const
1473 {
1474     if ( srcLen == wxNO_LEN )
1475         srcLen = wxWcslen(src) + 1;
1476
1477     srcLen *= BYTES_PER_CHAR;
1478
1479     if ( dst )
1480     {
1481         if ( dstLen < srcLen )
1482             return wxCONV_FAILED;
1483
1484         memcpy(dst, src, srcLen);
1485     }
1486
1487     return srcLen;
1488 }
1489
1490 // ----------------------------------------------------------------------------
1491 // endian-reversing conversions
1492 // ----------------------------------------------------------------------------
1493
1494 size_t
1495 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1496                            const char *src, size_t srcLen) const
1497 {
1498     srcLen = GetLength(src, srcLen);
1499     if ( srcLen == wxNO_LEN )
1500         return wxCONV_FAILED;
1501
1502     srcLen /= BYTES_PER_CHAR;
1503
1504     if ( dst )
1505     {
1506         if ( dstLen < srcLen )
1507             return wxCONV_FAILED;
1508
1509         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1510         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1511         {
1512             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1513         }
1514     }
1515
1516     return srcLen;
1517 }
1518
1519 size_t
1520 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1521                              const wchar_t *src, size_t srcLen) const
1522 {
1523     if ( srcLen == wxNO_LEN )
1524         srcLen = wxWcslen(src) + 1;
1525
1526     srcLen *= BYTES_PER_CHAR;
1527
1528     if ( dst )
1529     {
1530         if ( dstLen < srcLen )
1531             return wxCONV_FAILED;
1532
1533         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1534         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1535         {
1536             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1537         }
1538     }
1539
1540     return srcLen;
1541 }
1542
1543 #endif // WC_UTF16/!WC_UTF16
1544
1545
1546 // ============================================================================
1547 // The classes doing conversion using the iconv_xxx() functions
1548 // ============================================================================
1549
1550 #ifdef HAVE_ICONV
1551
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes it left unconverted and evaluates to true if the call
// must be considered as failed. The macro arguments are parenthesized so that
// arbitrary expressions can be safely passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type which iconv() takes as
// its second argument
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used in this file for conversion descriptors which are not open
#define ICONV_T_INVALID ((iconv_t)-1)
1570
1571 #if SIZEOF_WCHAR_T == 4
1572     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1573     #define WC_ENC      wxFONTENCODING_UTF32
1574 #elif SIZEOF_WCHAR_T == 2
1575     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1576     #define WC_ENC      wxFONTENCODING_UTF16
1577 #else // sizeof(wchar_t) != 2 nor 4
1578     // does this ever happen?
1579     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1580 #endif
1581
1582 // ----------------------------------------------------------------------------
1583 // wxMBConv_iconv: encapsulates an iconv character set
1584 // ----------------------------------------------------------------------------
1585
1586 class wxMBConv_iconv : public wxMBConv
1587 {
1588 public:
1589     wxMBConv_iconv(const wxChar *name);
1590     virtual ~wxMBConv_iconv();
1591
1592     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1593     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1594
1595     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1596     virtual size_t GetMBNulLen() const;
1597
1598     virtual wxMBConv *Clone() const
1599     {
1600         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1601         p->m_minMBCharWidth = m_minMBCharWidth;
1602         return p;
1603     }
1604
1605     bool IsOk() const
1606         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1607
1608 protected:
1609     // the iconv handlers used to translate from multibyte
1610     // to wide char and in the other direction
1611     iconv_t m2w,
1612             w2m;
1613
1614 #if wxUSE_THREADS
1615     // guards access to m2w and w2m objects
1616     wxMutex m_iconvMutex;
1617 #endif
1618
1619 private:
1620     // the name (for iconv_open()) of a wide char charset -- if none is
1621     // available on this machine, it will remain NULL
1622     static wxString ms_wcCharsetName;
1623
1624     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1625     // different endian-ness than the native one
1626     static bool ms_wcNeedsSwap;
1627
1628
1629     // name of the encoding handled by this conversion
1630     wxString m_name;
1631
1632     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1633     // initially
1634     size_t m_minMBCharWidth;
1635 };
1636
1637 // make the constructor available for unit testing
1638 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1639 {
1640     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1641     if ( !result->IsOk() )
1642     {
1643         delete result;
1644         return 0;
1645     }
1646
1647     return result;
1648 }
1649
// these are shared by all wxMBConv_iconv objects: the charset name is empty
// until the constructor finds a wide char charset accepted by iconv_open()
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1652
1653 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1654               : m_name(name)
1655 {
1656     m_minMBCharWidth = 0;
1657
1658     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1659     // names for the charsets
1660     const wxCharBuffer cname(wxString(name).ToAscii());
1661
1662     // check for charset that represents wchar_t:
1663     if ( ms_wcCharsetName.empty() )
1664     {
1665         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1666
1667 #if wxUSE_FONTMAP
1668         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1669 #else // !wxUSE_FONTMAP
1670         static const wxChar *names[] =
1671         {
1672 #if SIZEOF_WCHAR_T == 4
1673             _T("UCS-4"),
1674 #elif SIZEOF_WCHAR_T = 2
1675             _T("UCS-2"),
1676 #endif
1677             NULL
1678         };
1679 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1680
1681         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1682         {
1683             const wxString nameCS(*names);
1684
1685             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1686             wxString nameXE(nameCS);
1687
1688 #ifdef WORDS_BIGENDIAN
1689                 nameXE += _T("BE");
1690 #else // little endian
1691                 nameXE += _T("LE");
1692 #endif
1693
1694             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1695                        nameXE.c_str());
1696
1697             m2w = iconv_open(nameXE.ToAscii(), cname);
1698             if ( m2w == ICONV_T_INVALID )
1699             {
1700                 // try charset w/o bytesex info (e.g. "UCS4")
1701                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1702                            nameCS.c_str());
1703                 m2w = iconv_open(nameCS.ToAscii(), cname);
1704
1705                 // and check for bytesex ourselves:
1706                 if ( m2w != ICONV_T_INVALID )
1707                 {
1708                     char    buf[2], *bufPtr;
1709                     wchar_t wbuf[2], *wbufPtr;
1710                     size_t  insz, outsz;
1711                     size_t  res;
1712
1713                     buf[0] = 'A';
1714                     buf[1] = 0;
1715                     wbuf[0] = 0;
1716                     insz = 2;
1717                     outsz = SIZEOF_WCHAR_T * 2;
1718                     wbufPtr = wbuf;
1719                     bufPtr = buf;
1720
1721                     res = iconv(
1722                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1723                         (char**)&wbufPtr, &outsz);
1724
1725                     if (ICONV_FAILED(res, insz))
1726                     {
1727                         wxLogLastError(wxT("iconv"));
1728                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1729                                    nameCS.c_str());
1730                     }
1731                     else // ok, can convert to this encoding, remember it
1732                     {
1733                         ms_wcCharsetName = nameCS;
1734                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1735                     }
1736                 }
1737             }
1738             else // use charset not requiring byte swapping
1739             {
1740                 ms_wcCharsetName = nameXE;
1741             }
1742         }
1743
1744         wxLogTrace(TRACE_STRCONV,
1745                    wxT("iconv wchar_t charset is \"%s\"%s"),
1746                    ms_wcCharsetName.empty() ? _T("<none>")
1747                                             : ms_wcCharsetName.c_str(),
1748                    ms_wcNeedsSwap ? _T(" (needs swap)")
1749                                   : _T(""));
1750     }
1751     else // we already have ms_wcCharsetName
1752     {
1753         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1754     }
1755
1756     if ( ms_wcCharsetName.empty() )
1757     {
1758         w2m = ICONV_T_INVALID;
1759     }
1760     else
1761     {
1762         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1763         if ( w2m == ICONV_T_INVALID )
1764         {
1765             wxLogTrace(TRACE_STRCONV,
1766                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1767                        ms_wcCharsetName.c_str(), cname.data());
1768         }
1769     }
1770 }
1771
1772 wxMBConv_iconv::~wxMBConv_iconv()
1773 {
1774     if ( m2w != ICONV_T_INVALID )
1775         iconv_close(m2w);
1776     if ( w2m != ICONV_T_INVALID )
1777         iconv_close(w2m);
1778 }
1779
1780 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1781 {
1782     // find the string length: notice that must be done differently for
1783     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1784     size_t inbuf;
1785     const size_t nulLen = GetMBNulLen();
1786     switch ( nulLen )
1787     {
1788         default:
1789             return wxCONV_FAILED;
1790
1791         case 1:
1792             inbuf = strlen(psz); // arguably more optimized than our version
1793             break;
1794
1795         case 2:
1796         case 4:
1797             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1798             // they also have to start at character boundary and not span two
1799             // adjacent characters
1800             const char *p;
1801             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1802                 ;
1803             inbuf = p - psz;
1804             break;
1805     }
1806
1807 #if wxUSE_THREADS
1808     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1809     //     Unfortunately there is a couple of global wxCSConv objects such as
1810     //     wxConvLocal that are used all over wx code, so we have to make sure
1811     //     the handle is used by at most one thread at the time. Otherwise
1812     //     only a few wx classes would be safe to use from non-main threads
1813     //     as MB<->WC conversion would fail "randomly".
1814     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1815 #endif // wxUSE_THREADS
1816
1817     size_t outbuf = n * SIZEOF_WCHAR_T;
1818     size_t res, cres;
1819     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1820     wchar_t *bufPtr = buf;
1821     const char *pszPtr = psz;
1822
1823     if (buf)
1824     {
1825         // have destination buffer, convert there
1826         cres = iconv(m2w,
1827                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1828                      (char**)&bufPtr, &outbuf);
1829         res = n - (outbuf / SIZEOF_WCHAR_T);
1830
1831         if (ms_wcNeedsSwap)
1832         {
1833             // convert to native endianness
1834             for ( unsigned i = 0; i < res; i++ )
1835                 buf[n] = WC_BSWAP(buf[i]);
1836         }
1837
1838         // NUL-terminate the string if there is any space left
1839         if (res < n)
1840             buf[res] = 0;
1841     }
1842     else
1843     {
1844         // no destination buffer... convert using temp buffer
1845         // to calculate destination buffer requirement
1846         wchar_t tbuf[8];
1847         res = 0;
1848
1849         do
1850         {
1851             bufPtr = tbuf;
1852             outbuf = 8 * SIZEOF_WCHAR_T;
1853
1854             cres = iconv(m2w,
1855                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1856                          (char**)&bufPtr, &outbuf );
1857
1858             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1859         }
1860         while ((cres == (size_t)-1) && (errno == E2BIG));
1861     }
1862
1863     if (ICONV_FAILED(cres, inbuf))
1864     {
1865         //VS: it is ok if iconv fails, hence trace only
1866         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1867         return wxCONV_FAILED;
1868     }
1869
1870     return res;
1871 }
1872
1873 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1874 {
1875 #if wxUSE_THREADS
1876     // NB: explained in MB2WC
1877     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1878 #endif
1879
1880     size_t inlen = wxWcslen(psz);
1881     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1882     size_t outbuf = n;
1883     size_t res, cres;
1884
1885     wchar_t *tmpbuf = 0;
1886
1887     if (ms_wcNeedsSwap)
1888     {
1889         // need to copy to temp buffer to switch endianness
1890         // (doing WC_BSWAP twice on the original buffer won't help, as it
1891         //  could be in read-only memory, or be accessed in some other thread)
1892         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1893         for ( size_t i = 0; i < inlen; i++ )
1894             tmpbuf[n] = WC_BSWAP(psz[i]);
1895
1896         tmpbuf[inlen] = L'\0';
1897         psz = tmpbuf;
1898     }
1899
1900     if (buf)
1901     {
1902         // have destination buffer, convert there
1903         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1904
1905         res = n - outbuf;
1906
1907         // NB: iconv was given only wcslen(psz) characters on input, and so
1908         //     it couldn't convert the trailing zero. Let's do it ourselves
1909         //     if there's some room left for it in the output buffer.
1910         if (res < n)
1911             buf[0] = 0;
1912     }
1913     else
1914     {
1915         // no destination buffer: convert using temp buffer
1916         // to calculate destination buffer requirement
1917         char tbuf[16];
1918         res = 0;
1919         do
1920         {
1921             buf = tbuf;
1922             outbuf = 16;
1923
1924             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1925
1926             res += 16 - outbuf;
1927         }
1928         while ((cres == (size_t)-1) && (errno == E2BIG));
1929     }
1930
1931     if (ms_wcNeedsSwap)
1932     {
1933         free(tmpbuf);
1934     }
1935
1936     if (ICONV_FAILED(cres, inbuf))
1937     {
1938         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1939         return wxCONV_FAILED;
1940     }
1941
1942     return res;
1943 }
1944
1945 size_t wxMBConv_iconv::GetMBNulLen() const
1946 {
1947     if ( m_minMBCharWidth == 0 )
1948     {
1949         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1950
1951 #if wxUSE_THREADS
1952         // NB: explained in MB2WC
1953         wxMutexLocker lock(self->m_iconvMutex);
1954 #endif
1955
1956         wchar_t *wnul = L"";
1957         char buf[8]; // should be enough for NUL in any encoding
1958         size_t inLen = sizeof(wchar_t),
1959                outLen = WXSIZEOF(buf);
1960         char *inBuff = (char *)wnul;
1961         char *outBuff = buf;
1962         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1963         {
1964             self->m_minMBCharWidth = (size_t)-1;
1965         }
1966         else // ok
1967         {
1968             self->m_minMBCharWidth = outBuff - buf;
1969         }
1970     }
1971
1972     return m_minMBCharWidth;
1973 }
1974
1975 #endif // HAVE_ICONV
1976
1977
1978 // ============================================================================
1979 // Win32 conversion classes
1980 // ============================================================================
1981
1982 #ifdef wxHAVE_WIN32_MB2WC
1983
1984 // from utils.cpp
1985 #if wxUSE_FONTMAP
1986 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1987 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1988 #endif
1989
1990 class wxMBConv_win32 : public wxMBConv
1991 {
1992 public:
1993     wxMBConv_win32()
1994     {
1995         m_CodePage = CP_ACP;
1996         m_minMBCharWidth = 0;
1997     }
1998
1999     wxMBConv_win32(const wxMBConv_win32& conv)
2000         : wxMBConv()
2001     {
2002         m_CodePage = conv.m_CodePage;
2003         m_minMBCharWidth = conv.m_minMBCharWidth;
2004     }
2005
2006 #if wxUSE_FONTMAP
2007     wxMBConv_win32(const wxChar* name)
2008     {
2009         m_CodePage = wxCharsetToCodepage(name);
2010         m_minMBCharWidth = 0;
2011     }
2012
2013     wxMBConv_win32(wxFontEncoding encoding)
2014     {
2015         m_CodePage = wxEncodingToCodepage(encoding);
2016         m_minMBCharWidth = 0;
2017     }
2018 #endif // wxUSE_FONTMAP
2019
2020     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2021     {
2022         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2023         // the behaviour is not compatible with the Unix version (using iconv)
2024         // and break the library itself, e.g. wxTextInputStream::NextChar()
2025         // wouldn't work if reading an incomplete MB char didn't result in an
2026         // error
2027         //
2028         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2029         // Win XP or newer and it is not supported for UTF-[78] so we always
2030         // use our own conversions in this case. See
2031         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2032         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2033         if ( m_CodePage == CP_UTF8 )
2034         {
2035             return wxConvUTF8.MB2WC(buf, psz, n);
2036         }
2037
2038         if ( m_CodePage == CP_UTF7 )
2039         {
2040             return wxConvUTF7.MB2WC(buf, psz, n);
2041         }
2042
2043         int flags = 0;
2044         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2045                 IsAtLeastWin2kSP4() )
2046         {
2047             flags = MB_ERR_INVALID_CHARS;
2048         }
2049
2050         const size_t len = ::MultiByteToWideChar
2051                              (
2052                                 m_CodePage,     // code page
2053                                 flags,          // flags: fall on error
2054                                 psz,            // input string
2055                                 -1,             // its length (NUL-terminated)
2056                                 buf,            // output string
2057                                 buf ? n : 0     // size of output buffer
2058                              );
2059         if ( !len )
2060         {
2061             // function totally failed
2062             return wxCONV_FAILED;
2063         }
2064
2065         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2066         // check if we succeeded, by doing a double trip:
2067         if ( !flags && buf )
2068         {
2069             const size_t mbLen = strlen(psz);
2070             wxCharBuffer mbBuf(mbLen);
2071             if ( ::WideCharToMultiByte
2072                    (
2073                       m_CodePage,
2074                       0,
2075                       buf,
2076                       -1,
2077                       mbBuf.data(),
2078                       mbLen + 1,        // size in bytes, not length
2079                       NULL,
2080                       NULL
2081                    ) == 0 ||
2082                   strcmp(mbBuf, psz) != 0 )
2083             {
2084                 // we didn't obtain the same thing we started from, hence
2085                 // the conversion was lossy and we consider that it failed
2086                 return wxCONV_FAILED;
2087             }
2088         }
2089
2090         // note that it returns count of written chars for buf != NULL and size
2091         // of the needed buffer for buf == NULL so in either case the length of
2092         // the string (which never includes the terminating NUL) is one less
2093         return len - 1;
2094     }
2095
2096     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2097     {
2098         /*
2099             we have a problem here: by default, WideCharToMultiByte() may
2100             replace characters unrepresentable in the target code page with bad
2101             quality approximations such as turning "1/2" symbol (U+00BD) into
2102             "1" for the code pages which don't have it and we, obviously, want
2103             to avoid this at any price
2104
2105             the trouble is that this function does it _silently_, i.e. it won't
2106             even tell us whether it did or not... Win98/2000 and higher provide
2107             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2108             we have to resort to a round trip, i.e. check that converting back
2109             results in the same string -- this is, of course, expensive but
2110             otherwise we simply can't be sure to not garble the data.
2111          */
2112
2113         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2114         // it doesn't work with CJK encodings (which we test for rather roughly
2115         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2116         // supporting it
2117         BOOL usedDef wxDUMMY_INITIALIZE(false);
2118         BOOL *pUsedDef;
2119         int flags;
2120         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2121         {
2122             // it's our lucky day
2123             flags = WC_NO_BEST_FIT_CHARS;
2124             pUsedDef = &usedDef;
2125         }
2126         else // old system or unsupported encoding
2127         {
2128             flags = 0;
2129             pUsedDef = NULL;
2130         }
2131
2132         const size_t len = ::WideCharToMultiByte
2133                              (
2134                                 m_CodePage,     // code page
2135                                 flags,          // either none or no best fit
2136                                 pwz,            // input string
2137                                 -1,             // it is (wide) NUL-terminated
2138                                 buf,            // output buffer
2139                                 buf ? n : 0,    // and its size
2140                                 NULL,           // default "replacement" char
2141                                 pUsedDef        // [out] was it used?
2142                              );
2143
2144         if ( !len )
2145         {
2146             // function totally failed
2147             return wxCONV_FAILED;
2148         }
2149
2150         // if we were really converting, check if we succeeded
2151         if ( buf )
2152         {
2153             if ( flags )
2154             {
2155                 // check if the conversion failed, i.e. if any replacements
2156                 // were done
2157                 if ( usedDef )
2158                     return wxCONV_FAILED;
2159             }
2160             else // we must resort to double tripping...
2161             {
2162                 wxWCharBuffer wcBuf(n);
2163                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2164                         wcscmp(wcBuf, pwz) != 0 )
2165                 {
2166                     // we didn't obtain the same thing we started from, hence
2167                     // the conversion was lossy and we consider that it failed
2168                     return wxCONV_FAILED;
2169                 }
2170             }
2171         }
2172
2173         // see the comment above for the reason of "len - 1"
2174         return len - 1;
2175     }
2176
2177     virtual size_t GetMBNulLen() const
2178     {
2179         if ( m_minMBCharWidth == 0 )
2180         {
2181             int len = ::WideCharToMultiByte
2182                         (
2183                             m_CodePage,     // code page
2184                             0,              // no flags
2185                             L"",            // input string
2186                             1,              // translate just the NUL
2187                             NULL,           // output buffer
2188                             0,              // and its size
2189                             NULL,           // no replacement char
2190                             NULL            // [out] don't care if it was used
2191                         );
2192
2193             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2194             switch ( len )
2195             {
2196                 default:
2197                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2198                     self->m_minMBCharWidth = (size_t)-1;
2199                     break;
2200
2201                 case 0:
2202                     self->m_minMBCharWidth = (size_t)-1;
2203                     break;
2204
2205                 case 1:
2206                 case 2:
2207                 case 4:
2208                     self->m_minMBCharWidth = len;
2209                     break;
2210             }
2211         }
2212
2213         return m_minMBCharWidth;
2214     }
2215
2216     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2217
2218     bool IsOk() const { return m_CodePage != -1; }
2219
2220 private:
2221     static bool CanUseNoBestFit()
2222     {
2223         static int s_isWin98Or2k = -1;
2224
2225         if ( s_isWin98Or2k == -1 )
2226         {
2227             int verMaj, verMin;
2228             switch ( wxGetOsVersion(&verMaj, &verMin) )
2229             {
2230                 case wxWIN95:
2231                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2232                     break;
2233
2234                 case wxWINDOWS_NT:
2235                     s_isWin98Or2k = verMaj >= 5;
2236                     break;
2237
2238                 default:
2239                     // unknown: be conservative by default
2240                     s_isWin98Or2k = 0;
2241                     break;
2242             }
2243
2244             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2245         }
2246
2247         return s_isWin98Or2k == 1;
2248     }
2249
2250     static bool IsAtLeastWin2kSP4()
2251     {
2252 #ifdef __WXWINCE__
2253         return false;
2254 #else
2255         static int s_isAtLeastWin2kSP4 = -1;
2256
2257         if ( s_isAtLeastWin2kSP4 == -1 )
2258         {
2259             OSVERSIONINFOEX ver;
2260
2261             memset(&ver, 0, sizeof(ver));
2262             ver.dwOSVersionInfoSize = sizeof(ver);
2263             GetVersionEx((OSVERSIONINFO*)&ver);
2264
2265             s_isAtLeastWin2kSP4 =
2266               ((ver.dwMajorVersion > 5) || // Vista+
2267                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2268                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2269                ver.wServicePackMajor >= 4)) // 2000 SP4+
2270               ? 1 : 0;
2271         }
2272
2273         return s_isAtLeastWin2kSP4 == 1;
2274 #endif
2275     }
2276
2277
2278     // the code page we're working with
2279     long m_CodePage;
2280
2281     // cached result of GetMBNulLen(), set to 0 initially meaning
2282     // "unknown"
2283     size_t m_minMBCharWidth;
2284 };
2285
2286 #endif // wxHAVE_WIN32_MB2WC
2287
2288 // ============================================================================
2289 // Cocoa conversion classes
2290 // ============================================================================
2291
2292 #if defined(__WXCOCOA__)
2293
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2297
2298 #include <CoreFoundation/CFString.h>
2299 #include <CoreFoundation/CFStringEncodingExt.h>
2300
2301 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2302 {
2303     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2304
2305     switch (encoding)
2306     {
2307         case wxFONTENCODING_DEFAULT :
2308             enc = CFStringGetSystemEncoding();
2309             break ;
2310
2311         case wxFONTENCODING_ISO8859_1 :
2312             enc = kCFStringEncodingISOLatin1 ;
2313             break ;
2314         case wxFONTENCODING_ISO8859_2 :
2315             enc = kCFStringEncodingISOLatin2;
2316             break ;
2317         case wxFONTENCODING_ISO8859_3 :
2318             enc = kCFStringEncodingISOLatin3 ;
2319             break ;
2320         case wxFONTENCODING_ISO8859_4 :
2321             enc = kCFStringEncodingISOLatin4;
2322             break ;
2323         case wxFONTENCODING_ISO8859_5 :
2324             enc = kCFStringEncodingISOLatinCyrillic;
2325             break ;
2326         case wxFONTENCODING_ISO8859_6 :
2327             enc = kCFStringEncodingISOLatinArabic;
2328             break ;
2329         case wxFONTENCODING_ISO8859_7 :
2330             enc = kCFStringEncodingISOLatinGreek;
2331             break ;
2332         case wxFONTENCODING_ISO8859_8 :
2333             enc = kCFStringEncodingISOLatinHebrew;
2334             break ;
2335         case wxFONTENCODING_ISO8859_9 :
2336             enc = kCFStringEncodingISOLatin5;
2337             break ;
2338         case wxFONTENCODING_ISO8859_10 :
2339             enc = kCFStringEncodingISOLatin6;
2340             break ;
2341         case wxFONTENCODING_ISO8859_11 :
2342             enc = kCFStringEncodingISOLatinThai;
2343             break ;
2344         case wxFONTENCODING_ISO8859_13 :
2345             enc = kCFStringEncodingISOLatin7;
2346             break ;
2347         case wxFONTENCODING_ISO8859_14 :
2348             enc = kCFStringEncodingISOLatin8;
2349             break ;
2350         case wxFONTENCODING_ISO8859_15 :
2351             enc = kCFStringEncodingISOLatin9;
2352             break ;
2353
2354         case wxFONTENCODING_KOI8 :
2355             enc = kCFStringEncodingKOI8_R;
2356             break ;
2357         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2358             enc = kCFStringEncodingDOSRussian;
2359             break ;
2360
2361 //      case wxFONTENCODING_BULGARIAN :
2362 //          enc = ;
2363 //          break ;
2364
2365         case wxFONTENCODING_CP437 :
2366             enc = kCFStringEncodingDOSLatinUS ;
2367             break ;
2368         case wxFONTENCODING_CP850 :
2369             enc = kCFStringEncodingDOSLatin1;
2370             break ;
2371         case wxFONTENCODING_CP852 :
2372             enc = kCFStringEncodingDOSLatin2;
2373             break ;
2374         case wxFONTENCODING_CP855 :
2375             enc = kCFStringEncodingDOSCyrillic;
2376             break ;
2377         case wxFONTENCODING_CP866 :
2378             enc = kCFStringEncodingDOSRussian ;
2379             break ;
2380         case wxFONTENCODING_CP874 :
2381             enc = kCFStringEncodingDOSThai;
2382             break ;
2383         case wxFONTENCODING_CP932 :
2384             enc = kCFStringEncodingDOSJapanese;
2385             break ;
2386         case wxFONTENCODING_CP936 :
2387             enc = kCFStringEncodingDOSChineseSimplif ;
2388             break ;
2389         case wxFONTENCODING_CP949 :
2390             enc = kCFStringEncodingDOSKorean;
2391             break ;
2392         case wxFONTENCODING_CP950 :
2393             enc = kCFStringEncodingDOSChineseTrad;
2394             break ;
2395         case wxFONTENCODING_CP1250 :
2396             enc = kCFStringEncodingWindowsLatin2;
2397             break ;
2398         case wxFONTENCODING_CP1251 :
2399             enc = kCFStringEncodingWindowsCyrillic ;
2400             break ;
2401         case wxFONTENCODING_CP1252 :
2402             enc = kCFStringEncodingWindowsLatin1 ;
2403             break ;
2404         case wxFONTENCODING_CP1253 :
2405             enc = kCFStringEncodingWindowsGreek;
2406             break ;
2407         case wxFONTENCODING_CP1254 :
2408             enc = kCFStringEncodingWindowsLatin5;
2409             break ;
2410         case wxFONTENCODING_CP1255 :
2411             enc = kCFStringEncodingWindowsHebrew ;
2412             break ;
2413         case wxFONTENCODING_CP1256 :
2414             enc = kCFStringEncodingWindowsArabic ;
2415             break ;
2416         case wxFONTENCODING_CP1257 :
2417             enc = kCFStringEncodingWindowsBalticRim;
2418             break ;
2419 //   This only really encodes to UTF7 (if that) evidently
2420 //        case wxFONTENCODING_UTF7 :
2421 //            enc = kCFStringEncodingNonLossyASCII ;
2422 //            break ;
2423         case wxFONTENCODING_UTF8 :
2424             enc = kCFStringEncodingUTF8 ;
2425             break ;
2426         case wxFONTENCODING_EUC_JP :
2427             enc = kCFStringEncodingEUC_JP;
2428             break ;
2429         case wxFONTENCODING_UTF16 :
2430             enc = kCFStringEncodingUnicode ;
2431             break ;
2432         case wxFONTENCODING_MACROMAN :
2433             enc = kCFStringEncodingMacRoman ;
2434             break ;
2435         case wxFONTENCODING_MACJAPANESE :
2436             enc = kCFStringEncodingMacJapanese ;
2437             break ;
2438         case wxFONTENCODING_MACCHINESETRAD :
2439             enc = kCFStringEncodingMacChineseTrad ;
2440             break ;
2441         case wxFONTENCODING_MACKOREAN :
2442             enc = kCFStringEncodingMacKorean ;
2443             break ;
2444         case wxFONTENCODING_MACARABIC :
2445             enc = kCFStringEncodingMacArabic ;
2446             break ;
2447         case wxFONTENCODING_MACHEBREW :
2448             enc = kCFStringEncodingMacHebrew ;
2449             break ;
2450         case wxFONTENCODING_MACGREEK :
2451             enc = kCFStringEncodingMacGreek ;
2452             break ;
2453         case wxFONTENCODING_MACCYRILLIC :
2454             enc = kCFStringEncodingMacCyrillic ;
2455             break ;
2456         case wxFONTENCODING_MACDEVANAGARI :
2457             enc = kCFStringEncodingMacDevanagari ;
2458             break ;
2459         case wxFONTENCODING_MACGURMUKHI :
2460             enc = kCFStringEncodingMacGurmukhi ;
2461             break ;
2462         case wxFONTENCODING_MACGUJARATI :
2463             enc = kCFStringEncodingMacGujarati ;
2464             break ;
2465         case wxFONTENCODING_MACORIYA :
2466             enc = kCFStringEncodingMacOriya ;
2467             break ;
2468         case wxFONTENCODING_MACBENGALI :
2469             enc = kCFStringEncodingMacBengali ;
2470             break ;
2471         case wxFONTENCODING_MACTAMIL :
2472             enc = kCFStringEncodingMacTamil ;
2473             break ;
2474         case wxFONTENCODING_MACTELUGU :
2475             enc = kCFStringEncodingMacTelugu ;
2476             break ;
2477         case wxFONTENCODING_MACKANNADA :
2478             enc = kCFStringEncodingMacKannada ;
2479             break ;
2480         case wxFONTENCODING_MACMALAJALAM :
2481             enc = kCFStringEncodingMacMalayalam ;
2482             break ;
2483         case wxFONTENCODING_MACSINHALESE :
2484             enc = kCFStringEncodingMacSinhalese ;
2485             break ;
2486         case wxFONTENCODING_MACBURMESE :
2487             enc = kCFStringEncodingMacBurmese ;
2488             break ;
2489         case wxFONTENCODING_MACKHMER :
2490             enc = kCFStringEncodingMacKhmer ;
2491             break ;
2492         case wxFONTENCODING_MACTHAI :
2493             enc = kCFStringEncodingMacThai ;
2494             break ;
2495         case wxFONTENCODING_MACLAOTIAN :
2496             enc = kCFStringEncodingMacLaotian ;
2497             break ;
2498         case wxFONTENCODING_MACGEORGIAN :
2499             enc = kCFStringEncodingMacGeorgian ;
2500             break ;
2501         case wxFONTENCODING_MACARMENIAN :
2502             enc = kCFStringEncodingMacArmenian ;
2503             break ;
2504         case wxFONTENCODING_MACCHINESESIMP :
2505             enc = kCFStringEncodingMacChineseSimp ;
2506             break ;
2507         case wxFONTENCODING_MACTIBETAN :
2508             enc = kCFStringEncodingMacTibetan ;
2509             break ;
2510         case wxFONTENCODING_MACMONGOLIAN :
2511             enc = kCFStringEncodingMacMongolian ;
2512             break ;
2513         case wxFONTENCODING_MACETHIOPIC :
2514             enc = kCFStringEncodingMacEthiopic ;
2515             break ;
2516         case wxFONTENCODING_MACCENTRALEUR :
2517             enc = kCFStringEncodingMacCentralEurRoman ;
2518             break ;
2519         case wxFONTENCODING_MACVIATNAMESE :
2520             enc = kCFStringEncodingMacVietnamese ;
2521             break ;
2522         case wxFONTENCODING_MACARABICEXT :
2523             enc = kCFStringEncodingMacExtArabic ;
2524             break ;
2525         case wxFONTENCODING_MACSYMBOL :
2526             enc = kCFStringEncodingMacSymbol ;
2527             break ;
2528         case wxFONTENCODING_MACDINGBATS :
2529             enc = kCFStringEncodingMacDingbats ;
2530             break ;
2531         case wxFONTENCODING_MACTURKISH :
2532             enc = kCFStringEncodingMacTurkish ;
2533             break ;
2534         case wxFONTENCODING_MACCROATIAN :
2535             enc = kCFStringEncodingMacCroatian ;
2536             break ;
2537         case wxFONTENCODING_MACICELANDIC :
2538             enc = kCFStringEncodingMacIcelandic ;
2539             break ;
2540         case wxFONTENCODING_MACROMANIAN :
2541             enc = kCFStringEncodingMacRomanian ;
2542             break ;
2543         case wxFONTENCODING_MACCELTIC :
2544             enc = kCFStringEncodingMacCeltic ;
2545             break ;
2546         case wxFONTENCODING_MACGAELIC :
2547             enc = kCFStringEncodingMacGaelic ;
2548             break ;
2549 //      case wxFONTENCODING_MACKEYBOARD :
2550 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2551 //          break ;
2552
2553         default :
2554             // because gcc is picky
2555             break ;
2556     }
2557
2558     return enc ;
2559 }
2560
2561 class wxMBConv_cocoa : public wxMBConv
2562 {
2563 public:
2564     wxMBConv_cocoa()
2565     {
2566         Init(CFStringGetSystemEncoding()) ;
2567     }
2568
2569     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2570     {
2571         m_encoding = conv.m_encoding;
2572     }
2573
2574 #if wxUSE_FONTMAP
2575     wxMBConv_cocoa(const wxChar* name)
2576     {
2577         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2578     }
2579 #endif
2580
2581     wxMBConv_cocoa(wxFontEncoding encoding)
2582     {
2583         Init( wxCFStringEncFromFontEnc(encoding) );
2584     }
2585
2586     ~wxMBConv_cocoa()
2587     {
2588     }
2589
2590     void Init( CFStringEncoding encoding)
2591     {
2592         m_encoding = encoding ;
2593     }
2594
2595     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2596     {
2597         wxASSERT(szUnConv);
2598
2599         CFStringRef theString = CFStringCreateWithBytes (
2600                                                 NULL, //the allocator
2601                                                 (const UInt8*)szUnConv,
2602                                                 strlen(szUnConv),
2603                                                 m_encoding,
2604                                                 false //no BOM/external representation
2605                                                 );
2606
2607         wxASSERT(theString);
2608
2609         size_t nOutLength = CFStringGetLength(theString);
2610
2611         if (szOut == NULL)
2612         {
2613             CFRelease(theString);
2614             return nOutLength;
2615         }
2616
2617         CFRange theRange = { 0, nOutSize };
2618
2619 #if SIZEOF_WCHAR_T == 4
2620         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2621 #endif
2622
2623         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2624
2625         CFRelease(theString);
2626
2627         szUniCharBuffer[nOutLength] = '\0';
2628
2629 #if SIZEOF_WCHAR_T == 4
2630         wxMBConvUTF16 converter;
2631         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2632         delete [] szUniCharBuffer;
2633 #endif
2634
2635         return nOutLength;
2636     }
2637
2638     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2639     {
2640         wxASSERT(szUnConv);
2641
2642         size_t nRealOutSize;
2643         size_t nBufSize = wxWcslen(szUnConv);
2644         UniChar* szUniBuffer = (UniChar*) szUnConv;
2645
2646 #if SIZEOF_WCHAR_T == 4
2647         wxMBConvUTF16 converter ;
2648         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2649         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2650         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2651         nBufSize /= sizeof(UniChar);
2652 #endif
2653
2654         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2655                                 NULL, //allocator
2656                                 szUniBuffer,
2657                                 nBufSize,
2658                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2659                             );
2660
2661         wxASSERT(theString);
2662
2663         //Note that CER puts a BOM when converting to unicode
2664         //so we  check and use getchars instead in that case
2665         if (m_encoding == kCFStringEncodingUnicode)
2666         {
2667             if (szOut != NULL)
2668                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2669
2670             nRealOutSize = CFStringGetLength(theString) + 1;
2671         }
2672         else
2673         {
2674             CFStringGetBytes(
2675                 theString,
2676                 CFRangeMake(0, CFStringGetLength(theString)),
2677                 m_encoding,
2678                 0, //what to put in characters that can't be converted -
2679                     //0 tells CFString to return NULL if it meets such a character
2680                 false, //not an external representation
2681                 (UInt8*) szOut,
2682                 nOutSize,
2683                 (CFIndex*) &nRealOutSize
2684                         );
2685         }
2686
2687         CFRelease(theString);
2688
2689 #if SIZEOF_WCHAR_T == 4
2690         delete[] szUniBuffer;
2691 #endif
2692
2693         return  nRealOutSize - 1;
2694     }
2695
2696     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2697
2698     bool IsOk() const
2699     {
2700         return m_encoding != kCFStringEncodingInvalidId &&
2701               CFStringIsEncodingAvailable(m_encoding);
2702     }
2703
2704 private:
2705     CFStringEncoding m_encoding ;
2706 };
2707
2708 #endif // defined(__WXCOCOA__)
2709
2710 // ============================================================================
2711 // Mac conversion classes
2712 // ============================================================================
2713
2714 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2715
2716 class wxMBConv_mac : public wxMBConv
2717 {
2718 public:
2719     wxMBConv_mac()
2720     {
2721         Init(CFStringGetSystemEncoding()) ;
2722     }
2723
2724     wxMBConv_mac(const wxMBConv_mac& conv)
2725     {
2726         Init(conv.m_char_encoding);
2727     }
2728
2729 #if wxUSE_FONTMAP
2730     wxMBConv_mac(const wxChar* name)
2731     {
2732         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2733     }
2734 #endif
2735
2736     wxMBConv_mac(wxFontEncoding encoding)
2737     {
2738         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2739     }
2740
2741     ~wxMBConv_mac()
2742     {
2743         OSStatus status = noErr ;
2744         status = TECDisposeConverter(m_MB2WC_converter);
2745         status = TECDisposeConverter(m_WC2MB_converter);
2746     }
2747
2748
2749     void Init( TextEncodingBase encoding)
2750     {
2751         OSStatus status = noErr ;
2752         m_char_encoding = encoding ;
2753         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2754
2755         status = TECCreateConverter(&m_MB2WC_converter,
2756                                     m_char_encoding,
2757                                     m_unicode_encoding);
2758         status = TECCreateConverter(&m_WC2MB_converter,
2759                                     m_unicode_encoding,
2760                                     m_char_encoding);
2761     }
2762
2763     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2764     {
2765         OSStatus status = noErr ;
2766         ByteCount byteOutLen ;
2767         ByteCount byteInLen = strlen(psz) + 1;
2768         wchar_t *tbuf = NULL ;
2769         UniChar* ubuf = NULL ;
2770         size_t res = 0 ;
2771
2772         if (buf == NULL)
2773         {
2774             // Apple specs say at least 32
2775             n = wxMax( 32, byteInLen ) ;
2776             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2777         }
2778
2779         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2780
2781 #if SIZEOF_WCHAR_T == 4
2782         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2783 #else
2784         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2785 #endif
2786
2787         status = TECConvertText(
2788             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2789             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2790
2791 #if SIZEOF_WCHAR_T == 4
2792         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2793         // is not properly terminated we get random characters at the end
2794         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2795         wxMBConvUTF16 converter ;
2796         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2797         free( ubuf ) ;
2798 #else
2799         res = byteOutLen / sizeof( UniChar ) ;
2800 #endif
2801
2802         if ( buf == NULL )
2803              free(tbuf) ;
2804
2805         if ( buf  && res < n)
2806             buf[res] = 0;
2807
2808         return res ;
2809     }
2810
2811     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2812     {
2813         OSStatus status = noErr ;
2814         ByteCount byteOutLen ;
2815         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2816
2817         char *tbuf = NULL ;
2818
2819         if (buf == NULL)
2820         {
2821             // Apple specs say at least 32
2822             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2823             tbuf = (char*) malloc( n ) ;
2824         }
2825
2826         ByteCount byteBufferLen = n ;
2827         UniChar* ubuf = NULL ;
2828
2829 #if SIZEOF_WCHAR_T == 4
2830         wxMBConvUTF16 converter ;
2831         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2832         byteInLen = unicharlen ;
2833         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2834         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2835 #else
2836         ubuf = (UniChar*) psz ;
2837 #endif
2838
2839         status = TECConvertText(
2840             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2841             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2842
2843 #if SIZEOF_WCHAR_T == 4
2844         free( ubuf ) ;
2845 #endif
2846
2847         if ( buf == NULL )
2848             free(tbuf) ;
2849
2850         size_t res = byteOutLen ;
2851         if ( buf  && res < n)
2852         {
2853             buf[res] = 0;
2854
2855             //we need to double-trip to verify it didn't insert any ? in place
2856             //of bogus characters
2857             wxWCharBuffer wcBuf(n);
2858             size_t pszlen = wxWcslen(psz);
2859             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2860                         wxWcslen(wcBuf) != pszlen ||
2861                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2862             {
2863                 // we didn't obtain the same thing we started from, hence
2864                 // the conversion was lossy and we consider that it failed
2865                 return wxCONV_FAILED;
2866             }
2867         }
2868
2869         return res ;
2870     }
2871
2872     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2873
2874     bool IsOk() const
2875         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2876
2877 private:
2878     TECObjectRef m_MB2WC_converter;
2879     TECObjectRef m_WC2MB_converter;
2880
2881     TextEncodingBase m_char_encoding;
2882     TextEncodingBase m_unicode_encoding;
2883 };
2884
2885 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2886
2887 // ============================================================================
2888 // wxEncodingConverter based conversion classes
2889 // ============================================================================
2890
2891 #if wxUSE_FONTMAP
2892
2893 class wxMBConv_wxwin : public wxMBConv
2894 {
2895 private:
2896     void Init()
2897     {
2898         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2899                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2900     }
2901
2902 public:
2903     // temporarily just use wxEncodingConverter stuff,
2904     // so that it works while a better implementation is built
2905     wxMBConv_wxwin(const wxChar* name)
2906     {
2907         if (name)
2908             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2909         else
2910             m_enc = wxFONTENCODING_SYSTEM;
2911
2912         Init();
2913     }
2914
2915     wxMBConv_wxwin(wxFontEncoding enc)
2916     {
2917         m_enc = enc;
2918
2919         Init();
2920     }
2921
2922     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2923     {
2924         size_t inbuf = strlen(psz);
2925         if (buf)
2926         {
2927             if (!m2w.Convert(psz, buf))
2928                 return wxCONV_FAILED;
2929         }
2930         return inbuf;
2931     }
2932
2933     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2934     {
2935         const size_t inbuf = wxWcslen(psz);
2936         if (buf)
2937         {
2938             if (!w2m.Convert(psz, buf))
2939                 return wxCONV_FAILED;
2940         }
2941
2942         return inbuf;
2943     }
2944
2945     virtual size_t GetMBNulLen() const
2946     {
2947         switch ( m_enc )
2948         {
2949             case wxFONTENCODING_UTF16BE:
2950             case wxFONTENCODING_UTF16LE:
2951                 return 2;
2952
2953             case wxFONTENCODING_UTF32BE:
2954             case wxFONTENCODING_UTF32LE:
2955                 return 4;
2956
2957             default:
2958                 return 1;
2959         }
2960     }
2961
2962     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2963
2964     bool IsOk() const { return m_ok; }
2965
2966 public:
2967     wxFontEncoding m_enc;
2968     wxEncodingConverter m2w, w2m;
2969
2970 private:
2971     // were we initialized successfully?
2972     bool m_ok;
2973
2974     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2975 };
2976
2977 // make the constructors available for unit testing
2978 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2979 {
2980     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2981     if ( !result->IsOk() )
2982     {
2983         delete result;
2984         return 0;
2985     }
2986
2987     return result;
2988 }
2989
2990 #endif // wxUSE_FONTMAP
2991
2992 // ============================================================================
2993 // wxCSConv implementation
2994 // ============================================================================
2995
2996 void wxCSConv::Init()
2997 {
2998     m_name = NULL;
2999     m_convReal =  NULL;
3000     m_deferred = true;
3001 }
3002
3003 wxCSConv::wxCSConv(const wxChar *charset)
3004 {
3005     Init();
3006
3007     if ( charset )
3008     {
3009         SetName(charset);
3010     }
3011
3012 #if wxUSE_FONTMAP
3013     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3014 #else
3015     m_encoding = wxFONTENCODING_SYSTEM;
3016 #endif
3017 }
3018
3019 wxCSConv::wxCSConv(wxFontEncoding encoding)
3020 {
3021     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3022     {
3023         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3024
3025         encoding = wxFONTENCODING_SYSTEM;
3026     }
3027
3028     Init();
3029
3030     m_encoding = encoding;
3031 }
3032
3033 wxCSConv::~wxCSConv()
3034 {
3035     Clear();
3036 }
3037
3038 wxCSConv::wxCSConv(const wxCSConv& conv)
3039         : wxMBConv()
3040 {
3041     Init();
3042
3043     SetName(conv.m_name);
3044     m_encoding = conv.m_encoding;
3045 }
3046
3047 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3048 {
3049     Clear();
3050
3051     SetName(conv.m_name);
3052     m_encoding = conv.m_encoding;
3053
3054     return *this;
3055 }
3056
3057 void wxCSConv::Clear()
3058 {
3059     free(m_name);
3060     delete m_convReal;
3061
3062     m_name = NULL;
3063     m_convReal = NULL;
3064 }
3065
3066 void wxCSConv::SetName(const wxChar *charset)
3067 {
3068     if (charset)
3069     {
3070         m_name = wxStrdup(charset);
3071         m_deferred = true;
3072     }
3073 }
3074
3075 #if wxUSE_FONTMAP
3076 #include "wx/hashmap.h"
3077
3078 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3079                      wxEncodingNameCache );
3080
3081 static wxEncodingNameCache gs_nameCache;
3082 #endif
3083
3084 wxMBConv *wxCSConv::DoCreate() const
3085 {
3086 #if wxUSE_FONTMAP
3087     wxLogTrace(TRACE_STRCONV,
3088                wxT("creating conversion for %s"),
3089                (m_name ? m_name
3090                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3091 #endif // wxUSE_FONTMAP
3092
3093     // check for the special case of ASCII or ISO8859-1 charset: as we have
3094     // special knowledge of it anyhow, we don't need to create a special
3095     // conversion object
3096     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3097             m_encoding == wxFONTENCODING_DEFAULT )
3098     {
3099         // don't convert at all
3100         return NULL;
3101     }
3102
3103     // we trust OS to do conversion better than we can so try external
3104     // conversion methods first
3105     //
3106     // the full order is:
3107     //      1. OS conversion (iconv() under Unix or Win32 API)
3108     //      2. hard coded conversions for UTF
3109     //      3. wxEncodingConverter as fall back
3110
3111     // step (1)
3112 #ifdef HAVE_ICONV
3113 #if !wxUSE_FONTMAP
3114     if ( m_name )
3115 #endif // !wxUSE_FONTMAP
3116     {
3117         wxString name(m_name);
3118         wxFontEncoding encoding(m_encoding);
3119
3120         if ( !name.empty() )
3121         {
3122             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3123             if ( conv->IsOk() )
3124                 return conv;
3125
3126             delete conv;
3127
3128 #if wxUSE_FONTMAP
3129             encoding =
3130                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3131 #endif // wxUSE_FONTMAP
3132         }
3133 #if wxUSE_FONTMAP
3134         {
3135             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3136             if ( it != gs_nameCache.end() )
3137             {
3138                 if ( it->second.empty() )
3139                     return NULL;
3140
3141                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3142                 if ( conv->IsOk() )
3143                     return conv;
3144
3145                 delete conv;
3146             }
3147
3148             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3149
3150             for ( ; *names; ++names )
3151             {
3152                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3153                 if ( conv->IsOk() )
3154                 {
3155                     gs_nameCache[encoding] = *names;
3156                     return conv;
3157                 }
3158
3159                 delete conv;
3160             }
3161
3162             gs_nameCache[encoding] = _T(""); // cache the failure
3163         }
3164 #endif // wxUSE_FONTMAP
3165     }
3166 #endif // HAVE_ICONV
3167
3168 #ifdef wxHAVE_WIN32_MB2WC
3169     {
3170 #if wxUSE_FONTMAP
3171         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3172                                       : new wxMBConv_win32(m_encoding);
3173         if ( conv->IsOk() )
3174             return conv;
3175
3176         delete conv;
3177 #else
3178         return NULL;
3179 #endif
3180     }
3181 #endif // wxHAVE_WIN32_MB2WC
3182
3183 #if defined(__WXMAC__)
3184     {
3185         // leave UTF16 and UTF32 to the built-ins of wx
3186         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3187             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3188         {
3189 #if wxUSE_FONTMAP
3190             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3191                                         : new wxMBConv_mac(m_encoding);
3192 #else
3193             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3194 #endif
3195             if ( conv->IsOk() )
3196                  return conv;
3197
3198             delete conv;
3199         }
3200     }
3201 #endif
3202
3203 #if defined(__WXCOCOA__)
3204     {
3205         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3206         {
3207 #if wxUSE_FONTMAP
3208             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3209                                           : new wxMBConv_cocoa(m_encoding);
3210 #else
3211             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3212 #endif
3213
3214             if ( conv->IsOk() )
3215                  return conv;
3216
3217             delete conv;
3218         }
3219     }
3220 #endif
3221     // step (2)
3222     wxFontEncoding enc = m_encoding;
3223 #if wxUSE_FONTMAP
3224     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3225     {
3226         // use "false" to suppress interactive dialogs -- we can be called from
3227         // anywhere and popping up a dialog from here is the last thing we want to
3228         // do
3229         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3230     }
3231 #endif // wxUSE_FONTMAP
3232
3233     switch ( enc )
3234     {
3235         case wxFONTENCODING_UTF7:
3236              return new wxMBConvUTF7;
3237
3238         case wxFONTENCODING_UTF8:
3239              return new wxMBConvUTF8;
3240
3241         case wxFONTENCODING_UTF16BE:
3242              return new wxMBConvUTF16BE;
3243
3244         case wxFONTENCODING_UTF16LE:
3245              return new wxMBConvUTF16LE;
3246
3247         case wxFONTENCODING_UTF32BE:
3248              return new wxMBConvUTF32BE;
3249
3250         case wxFONTENCODING_UTF32LE:
3251              return new wxMBConvUTF32LE;
3252
3253         default:
3254              // nothing to do but put here to suppress gcc warnings
3255              break;
3256     }
3257
3258     // step (3)
3259 #if wxUSE_FONTMAP
3260     {
3261         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3262                                       : new wxMBConv_wxwin(m_encoding);
3263         if ( conv->IsOk() )
3264             return conv;
3265
3266         delete conv;
3267     }
3268 #endif // wxUSE_FONTMAP
3269
3270     // NB: This is a hack to prevent deadlock. What could otherwise happen
3271     //     in Unicode build: wxConvLocal creation ends up being here
3272     //     because of some failure and logs the error. But wxLog will try to
3273     //     attach timestamp, for which it will need wxConvLocal (to convert
3274     //     time to char* and then wchar_t*), but that fails, tries to log
3275     //     error, but wxLog has a (already locked) critical section that
3276     //     guards static buffer.
3277     static bool alreadyLoggingError = false;
3278     if (!alreadyLoggingError)
3279     {
3280         alreadyLoggingError = true;
3281         wxLogError(_("Cannot convert from the charset '%s'!"),
3282                    m_name ? m_name
3283                       :
3284 #if wxUSE_FONTMAP
3285                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3286 #else // !wxUSE_FONTMAP
3287                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3288 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3289               );
3290
3291         alreadyLoggingError = false;
3292     }
3293
3294     return NULL;
3295 }
3296
3297 void wxCSConv::CreateConvIfNeeded() const
3298 {
3299     if ( m_deferred )
3300     {
3301         wxCSConv *self = (wxCSConv *)this; // const_cast
3302
3303 #if wxUSE_INTL
3304         // if we don't have neither the name nor the encoding, use the default
3305         // encoding for this system
3306         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3307         {
3308             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3309         }
3310 #endif // wxUSE_INTL
3311
3312         self->m_convReal = DoCreate();
3313         self->m_deferred = false;
3314     }
3315 }
3316
3317 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3318 {
3319     CreateConvIfNeeded();
3320
3321     if (m_convReal)
3322         return m_convReal->MB2WC(buf, psz, n);
3323
3324     // latin-1 (direct)
3325     size_t len = strlen(psz);
3326
3327     if (buf)
3328     {
3329         for (size_t c = 0; c <= len; c++)
3330             buf[c] = (unsigned char)(psz[c]);
3331     }
3332
3333     return len;
3334 }
3335
3336 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3337 {
3338     CreateConvIfNeeded();
3339
3340     if (m_convReal)
3341         return m_convReal->WC2MB(buf, psz, n);
3342
3343     // latin-1 (direct)
3344     const size_t len = wxWcslen(psz);
3345     if (buf)
3346     {
3347         for (size_t c = 0; c <= len; c++)
3348         {
3349             if (psz[c] > 0xFF)
3350                 return wxCONV_FAILED;
3351
3352             buf[c] = (char)psz[c];
3353         }
3354     }
3355     else
3356     {
3357         for (size_t c = 0; c <= len; c++)
3358         {
3359             if (psz[c] > 0xFF)
3360                 return wxCONV_FAILED;
3361         }
3362     }
3363
3364     return len;
3365 }
3366
3367 size_t wxCSConv::GetMBNulLen() const
3368 {
3369     CreateConvIfNeeded();
3370
3371     if ( m_convReal )
3372     {
3373         return m_convReal->GetMBNulLen();
3374     }
3375
3376     return 1;
3377 }
3378
3379 // ----------------------------------------------------------------------------
3380 // globals
3381 // ----------------------------------------------------------------------------
3382
// the object behind wxConvLibc: a platform specific class under Windows and
// non-Mach Mac builds, wxMBConvLibc everywhere else
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined conversions
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported names are references (or pointers) to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// file names use the UTF-8 conversion under __WXOSX__ and the libc one
// elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3409
3410 #else // !wxUSE_WCHAR_T
3411
// stand-ins in absence of wchar_t: the predefined conversions are plain
// wxMBConv objects in this configuration
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3417
3418 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T