src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #include "wx/intl.h"
  20     #include "wx/log.h"
  21 #endif
  22
  23 #include "wx/strconv.h"
  24
  25 #if wxUSE_WCHAR_T
  26
  27 #ifdef __WINDOWS__
  28     #include "wx/msw/private.h"
  29     #include "wx/msw/missing.h"
  30 #endif
  31
  32 #ifndef __WXWINCE__
  33 #include <errno.h>
  34 #endif
  35
  36 #include <ctype.h>
  37 #include <string.h>
  38 #include <stdlib.h>
  39
  40 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  41     #define wxHAVE_WIN32_MB2WC
  42 #endif
  43
  44 #ifdef __SALFORDC__
  45     #include <clib.h>
  46 #endif
  47
  48 #ifdef HAVE_ICONV
  49     #include <iconv.h>
  50     #include "wx/thread.h"
  51 #endif
  52
  53 #include "wx/encconv.h"
  54 #include "wx/fontmap.h"
  55 #include "wx/utils.h"
  56
  57 #ifdef __WXMAC__
  58 #ifndef __DARWIN__
  59 #include <ATSUnicode.h>
  60 #include <TextCommon.h>
  61 #include <TextEncodingConverter.h>
  62 #endif
  63
  64 // includes Mac headers
  65 #include "wx/mac/private.h"
  66 #endif
  67
  68
// NOTE(review): presumably the trace mask used for this file's wxLogTrace()
// calls; no use of it is visible in this part of the file -- confirm
#define TRACE_STRCONV _T("strconv")

// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise wchar_t is
// supposed to be 4 bytes wide; the code below uses this symbol to select
// between the UTF-16 (surrogates-aware) and UTF-32 code paths
#if SIZEOF_WCHAR_T == 2
    #define WC_UTF16
#endif
  76
  77
  78 // ============================================================================
  79 // implementation
  80 // ============================================================================
  81
  82 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  83 static bool NotAllNULs(const char *p, size_t n)
  84 {
  85     while ( n && *p++ == '\0' )
  86         n--;
  87
  88     return n != 0;
  89 }
  90
  91 // ----------------------------------------------------------------------------
  92 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  93 // ----------------------------------------------------------------------------
  94
  95 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  96 {
  97     if (input <= 0xffff)
  98     {
  99         if (output)
 100             *output = (wxUint16) input;
 101
 102         return 1;
 103     }
 104     else if (input >= 0x110000)
 105     {
 106         return wxCONV_FAILED;
 107     }
 108     else
 109     {
 110         if (output)
 111         {
 112             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 113             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 114         }
 115
 116         return 2;
 117     }
 118 }
 119
 120 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 121 {
 122     if ((*input < 0xd800) || (*input > 0xdfff))
 123     {
 124         output = *input;
 125         return 1;
 126     }
 127     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 128     {
 129         output = *input;
 130         return wxCONV_FAILED;
 131     }
 132     else
 133     {
 134         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 135         return 2;
 136     }
 137 }
 138
// type of the elements of the buffer wxDecodeSurrogate() reads from: wchar_t
// itself when it is 16 bits wide (WC_UTF16 defined), wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 144
 145 // returns the next UTF-32 character from the wchar_t buffer and advances the
 146 // pointer to the character after this one
 147 //
 148 // if an invalid character is found, *pSrc is set to NULL, the caller must
 149 // check for this
 150 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 151 {
 152     wxUint32 out;
 153     const size_t n = decode_utf16(wx_reinterpret_cast(wxUint16 *, *pSrc), out);
 154     if ( n == wxCONV_FAILED )
 155         *pSrc = NULL;
 156     else
 157         *pSrc += n;
 158
 159     return out;
 160 }
 161
 162 // ----------------------------------------------------------------------------
 163 // wxMBConv
 164 // ----------------------------------------------------------------------------
 165
// Default implementation of multibyte to wide char conversion.
//
// dst may be NULL, in which case nothing is written and only the required
// length is computed; otherwise it must have room for dstLen wide characters.
// srcLen is either the length of src in bytes or wxNO_LEN if src is
// NUL-terminated.
//
// Returns the number of wide characters written (or which would be written),
// counting one terminating L'\0' per converted chunk, or wxCONV_FAILED if
// GetMBNulLen() or MB2WC() fail or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string (only set, and only used,
    // when srcLen is given)
    size_t nulLen wxDUMMY_INITIALIZE(0);

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        // computed after src was possibly redirected to the copy above
        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes the
        // length needed for it
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // notice that the L'\0' of this empty chunk is counted in
            // dstWritten but is not stored in dst as we leave the loop before
            // writing anything
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 272
 273 size_t
 274 wxMBConv::FromWChar(char *dst, size_t dstLen,
 275                     const wchar_t *src, size_t srcLen) const
 276 {
 277     // the number of chars [which would be] written to dst [if it were not NULL]
 278     size_t dstWritten = 0;
 279
 280     // make a copy of the input string unless it is already properly
 281     // NUL-terminated
 282     //
 283     // if we don't know its length we have no choice but to assume that it is,
 284     // indeed, properly terminated
 285     wxWCharBuffer bufTmp;
 286     if ( srcLen == wxNO_LEN )
 287     {
 288         srcLen = wxWcslen(src) + 1;
 289     }
 290     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 291     {
 292         // make a copy in order to properly NUL-terminate the string
 293         bufTmp = wxWCharBuffer(srcLen);
 294         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 295         src = bufTmp;
 296     }
 297
 298     const size_t lenNul = GetMBNulLen();
 299     for ( const wchar_t * const srcEnd = src + srcLen;
 300           src < srcEnd;
 301           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 302     {
 303         // try to convert the current chunk
 304         size_t lenChunk = WC2MB(NULL, src, 0);
 305
 306         if ( lenChunk == wxCONV_FAILED )
 307             return wxCONV_FAILED;
 308
 309         lenChunk += lenNul;
 310         dstWritten += lenChunk;
 311
 312         if ( dst )
 313         {
 314             if ( dstWritten > dstLen )
 315                 return wxCONV_FAILED;
 316
 317             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 318                 return wxCONV_FAILED;
 319
 320             dst += lenChunk;
 321         }
 322     }
 323
 324     return dstWritten;
 325 }
 326
 327 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 328 {
 329     size_t rc = ToWChar(outBuff, outLen, inBuff);
 330     if ( rc != wxCONV_FAILED )
 331     {
 332         // ToWChar() returns the buffer length, i.e. including the trailing
 333         // NUL, while this method doesn't take it into account
 334         rc--;
 335     }
 336
 337     return rc;
 338 }
 339
 340 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 341 {
 342     size_t rc = FromWChar(outBuff, outLen, inBuff);
 343     if ( rc != wxCONV_FAILED )
 344     {
 345         rc -= GetMBNulLen();
 346     }
 347
 348     return rc;
 349 }
 350
// the destructor is intentionally empty but is defined out of line
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 355
 356 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 357 {
 358     if ( psz )
 359     {
 360         // calculate the length of the buffer needed first
 361         const size_t nLen = MB2WC(NULL, psz, 0);
 362         if ( nLen != wxCONV_FAILED )
 363         {
 364             // now do the actual conversion
 365             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 366
 367             // +1 for the trailing NULL
 368             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 369                 return buf;
 370         }
 371     }
 372
 373     return wxWCharBuffer();
 374 }
 375
 376 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 377 {
 378     if ( pwz )
 379     {
 380         const size_t nLen = WC2MB(NULL, pwz, 0);
 381         if ( nLen != wxCONV_FAILED )
 382         {
 383             // extra space for trailing NUL(s)
 384             static const size_t extraLen = GetMaxMBNulLen();
 385
 386             wxCharBuffer buf(nLen + extraLen - 1);
 387             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 388                 return buf;
 389         }
 390     }
 391
 392     return wxCharBuffer();
 393 }
 394
 395 const wxWCharBuffer
 396 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 397 {
 398     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 399     if ( dstLen != wxCONV_FAILED )
 400     {
 401         wxWCharBuffer wbuf(dstLen - 1);
 402         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 403         {
 404             if ( outLen )
 405             {
 406                 *outLen = dstLen;
 407                 if ( wbuf[dstLen - 1] == L'\0' )
 408                     (*outLen)--;
 409             }
 410
 411             return wbuf;
 412         }
 413     }
 414
 415     if ( outLen )
 416         *outLen = 0;
 417
 418     return wxWCharBuffer();
 419 }
 420
 421 const wxCharBuffer
 422 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 423 {
 424     const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 425     if ( dstLen != wxCONV_FAILED )
 426     {
 427         wxCharBuffer buf(dstLen - 1);
 428         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 429         {
 430             if ( outLen )
 431             {
 432                 *outLen = dstLen;
 433
 434                 const size_t nulLen = GetMBNulLen();
 435                 if ( !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 436                 {
 437                     // in this case the output is NUL-terminated and we're not
 438                     // supposed to count NUL
 439                     (*outLen) -= nulLen;
 440                 }
 441             }
 442
 443             return buf;
 444         }
 445     }
 446
 447     if ( outLen )
 448         *outLen = 0;
 449
 450     return wxCharBuffer();
 451 }
 452
 453 // ----------------------------------------------------------------------------
 454 // wxMBConvLibc
 455 // ----------------------------------------------------------------------------
 456
// simply forwards all the arguments to wxMB2WC() and returns its result
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 461
// simply forwards all the arguments to wxWC2MB() and returns its result
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 466
 467 // ----------------------------------------------------------------------------
 468 // wxConvBrokenFileNames
 469 // ----------------------------------------------------------------------------
 470
 471 #ifdef __UNIX__
 472
 473 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 474 {
 475     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 476                   || wxStricmp(charset, _T("UTF8")) == 0  )
 477         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 478     else
 479         m_conv = new wxCSConv(charset);
 480 }
 481
 482 #endif // __UNIX__
 483
 484 // ----------------------------------------------------------------------------
 485 // UTF-7
 486 // ----------------------------------------------------------------------------
 487
 488 // Implementation (C) 2004 Fredrik Roubert
 489
//
// BASE64 decoding table
//
// This table has 256 entries and is indexed by the value of an input byte: the
// entries for 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' contain the 6 bit
// value (0..0x3f) encoded by this character and all the other ones contain
// 0xff, which wxMBConvUTF7::MB2WC() uses to detect the end of a BASE64
// sequence.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 528
// Converts the NUL-terminated UTF-7 string psz to wide characters.
//
// If buf is NULL, nothing is written and only the length of the result is
// computed; otherwise the conversion stops once n characters were produced
// and the result is only NUL-terminated if there is still room for it.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL, or wxCONV_FAILED if a '+' is followed by neither '-' nor enough valid
// BASE64 characters to yield at least one byte.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            //
            // NOTE(review): any byte other than '+' is copied here, including
            // the ones >= 0x80, i.e. they are not rejected as invalid
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in it
            // not consumed yet, lsb indicates which half of the 16 bit unit
            // the next byte is and ok is set once a byte has been decoded
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 character contributes 6 more bits
                d <<= 6;
                d += cc;

                // NOTE(review): neither len nor the space left in buf is
                // checked against n inside this loop, only the outer loop
                // condition does it
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte: it completes the current character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte: store it without advancing, the low byte
                        // is combined with it later
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return wxCONV_FAILED;
            }

            // skip the optional explicit terminator of the BASE64 sequence
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 596
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63), gives the character encoding it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 611
//
// UTF-7 encoding table
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The table is indexed by the character code and so only covers the 7 bit
// characters. Notice that wxMBConvUTF7::WC2MB() only writes the characters
// with the value 0 directly, all the other ones are BASE64-encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 631
// Converts the NUL-terminated wide character string psz to UTF-7.
//
// If buf is NULL, nothing is written and only the length of the result is
// computed; otherwise the conversion stops once n bytes were produced and the
// result is only NUL-terminated if there is still room for it.
//
// Returns the number of bytes produced, not counting the trailing NUL, or
// wxCONV_FAILED if, when wchar_t is not 16 bits wide, a character starting an
// encoded sequence is greater than 0xffff.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;

        // NOTE(review): if wchar_t is a signed type, a negative value passes
        // the "cc < 0x80" test and is then used as the table index -- confirm
        // that such input can't occur
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // everything else is written as "+<BASE64>-", with the plus sign
            // itself being represented by the empty sequence "+-"
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to output and l is the number of
                // bits in it not output yet; this character and all the
                // following ones are encoded until either the end of the
                // string or a directly encoded character is reached
                //
                // NOTE(review): the characters examined at the end of each
                // iteration are not checked for being greater than 0xffff,
                // only their low 16 bits are used
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // the high byte of the character goes first, then the
                    // low one
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // psz is only advanced by the loop increment, i.e. if
                    // this character is encoded too
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                if (l != 0)
                {
                    // output the remaining bits (there are less than 6 of
                    // them here) in the high bits of the last BASE64
                    // character, the low ones being 0
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }

            // the terminator of the encoded sequence
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 705
 706 // ----------------------------------------------------------------------------
 707 // UTF-8
 708 // ----------------------------------------------------------------------------
 709
// utf8_max[i] is used by the functions below as the greatest value which can
// be encoded using i continuation bytes, i.e. i + 1 bytes in total
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: byte b is mapped to
// wxUnicodePUA + b, the end of the range is exclusive
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 717
// Converts the NUL-terminated UTF-8 string psz to wide characters.
//
// If buf is NULL, nothing is written and only the length of the result is
// computed; otherwise the conversion stops once n characters were produced
// and the result is only NUL-terminated if there is still room for it.
//
// The handling of invalid input depends on m_options: the bytes of an invalid
// sequence are either mapped to the private use area characters
// (MAP_INVALID_UTF8_TO_PUA), or written as backslash followed by 3 octal
// digits (MAP_INVALID_UTF8_TO_OCTAL, real backslashes are doubled in this
// case), or make the conversion fail.
//
// Returns the number of wide characters produced, not counting the trailing
// NUL, or wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to remap its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of the continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                //
                // (the first byte had a single leading 1 bit, i.e. it is a
                // continuation byte itself)
                invalid = true;
            }
            else
            {
                // index in utf8_max of the greatest value which could have
                // been encoded using one byte less
                unsigned ocnt = cnt - 1;

                // the payload bits of the first byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        //
                        // (the offending byte is not consumed)
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    //
                    // (either a bad continuation byte was found above or the
                    // value could have been encoded using fewer bytes)
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        // the value can't be represented in UTF-16
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // all the bytes from opsz up to (but not including) psz are
                // handled one by one according to the options
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only written if it fits entirely but
                        // len is increased by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 861
 862 static inline bool isoctal(wchar_t wch)
 863 {
 864     return L'0' <= wch && wch <= L'7';
 865 }
 866
// Converts the NUL-terminated wide character string psz to UTF-8.
//
// If buf is NULL, nothing is written and only the length of the result is
// computed; otherwise the result is only NUL-terminated if there is still
// room for it.
//
// Depending on m_options, the mappings done by MB2WC() are reversed: the
// characters in the private use area range are output as single bytes
// (MAP_INVALID_UTF8_TO_PUA) or double backslashes and backslashes followed by
// 3 octal digits are output as a single byte (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of bytes produced, not counting the trailing NUL.
//
// NOTE(review): n is only checked at the start of each iteration, so a
// multibyte sequence can be written past n bytes if buf is too small.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // if decoding fails, we just skip a single unit (cc is set to its
        // value by decode_utf16() in this case)
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        // only the low 31 bits are used
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte from the remapped character
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output a single one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it encodes
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte: its high bits are taken from
                    // "-128 >> cnt" and the remaining ones are the highest
                    // bits of the value
                    //
                    // NOTE(review): this relies on the right shift of a
                    // negative value being arithmetic -- confirm for all the
                    // supported compilers
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // the continuation bytes: 6 bits of the value in each,
                    // the most significant ones first
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 944
 945 // ============================================================================
 946 // UTF-16
 947 // ============================================================================
 948
// wxMBConvUTF16straight is the name of the UTF-16 conversion class using the
// same byte order as this platform (BE if WORDS_BIGENDIAN is defined, LE
// otherwise) and wxMBConvUTF16swap is the name of the other one
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 956
 957 /* static */
 958 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 959 {
 960     if ( srcLen == wxNO_LEN )
 961     {
 962         // count the number of bytes in input, including the trailing NULs
 963         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 964         for ( srcLen = 1; *inBuff++; srcLen++ )
 965             ;
 966
 967         srcLen *= BYTES_PER_CHAR;
 968     }
 969     else // we already have the length
 970     {
 971         // we can only convert an entire number of UTF-16 characters
 972         if ( srcLen % BYTES_PER_CHAR )
 973             return wxCONV_FAILED;
 974     }
 975
 976     return srcLen;
 977 }
 978
 979 // case when in-memory representation is UTF-16 too
 980 #ifdef WC_UTF16
 981
 982 // ----------------------------------------------------------------------------
 983 // conversions without endianness change
 984 // ----------------------------------------------------------------------------
 985
 986 size_t
 987 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 988                                const char *src, size_t srcLen) const
 989 {
 990     // set up the scene for using memcpy() (which is presumably more efficient
 991     // than copying the bytes one by one)
 992     srcLen = GetLength(src, srcLen);
 993     if ( srcLen == wxNO_LEN )
 994         return wxCONV_FAILED;
 995
 996     const size_t inLen = srcLen / BYTES_PER_CHAR;
 997     if ( dst )
 998     {
 999         if ( dstLen < inLen )
1000             return wxCONV_FAILED;
1001
1002         memcpy(dst, src, srcLen);
1003     }
1004
1005     return inLen;
1006 }
1007
1008 size_t
1009 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1010                                  const wchar_t *src, size_t srcLen) const
1011 {
1012     if ( srcLen == wxNO_LEN )
1013         srcLen = wxWcslen(src) + 1;
1014
1015     srcLen *= BYTES_PER_CHAR;
1016
1017     if ( dst )
1018     {
1019         if ( dstLen < srcLen )
1020             return wxCONV_FAILED;
1021
1022         memcpy(dst, src, srcLen);
1023     }
1024
1025     return srcLen;
1026 }
1027
1028 // ----------------------------------------------------------------------------
1029 // endian-reversing conversions
1030 // ----------------------------------------------------------------------------
1031
1032 size_t
1033 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1034                            const char *src, size_t srcLen) const
1035 {
1036     srcLen = GetLength(src, srcLen);
1037     if ( srcLen == wxNO_LEN )
1038         return wxCONV_FAILED;
1039
1040     srcLen /= BYTES_PER_CHAR;
1041
1042     if ( dst )
1043     {
1044         if ( dstLen < srcLen )
1045             return wxCONV_FAILED;
1046
1047         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1048         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1049         {
1050             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1051         }
1052     }
1053
1054     return srcLen;
1055 }
1056
1057 size_t
1058 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1059                              const wchar_t *src, size_t srcLen) const
1060 {
1061     if ( srcLen == wxNO_LEN )
1062         srcLen = wxWcslen(src) + 1;
1063
1064     srcLen *= BYTES_PER_CHAR;
1065
1066     if ( dst )
1067     {
1068         if ( dstLen < srcLen )
1069             return wxCONV_FAILED;
1070
1071         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1072         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1073         {
1074             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1075         }
1076     }
1077
1078     return srcLen;
1079 }
1080
1081 #else // !WC_UTF16: wchar_t is UTF-32
1082
1083 // ----------------------------------------------------------------------------
1084 // conversions without endianness change
1085 // ----------------------------------------------------------------------------
1086
1087 size_t
1088 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1089                                const char *src, size_t srcLen) const
1090 {
1091     srcLen = GetLength(src, srcLen);
1092     if ( srcLen == wxNO_LEN )
1093         return wxCONV_FAILED;
1094
1095     const size_t inLen = srcLen / BYTES_PER_CHAR;
1096     if ( !dst )
1097     {
1098         // optimization: return maximal space which could be needed for this
1099         // string even if the real size could be smaller if the buffer contains
1100         // any surrogates
1101         return inLen;
1102     }
1103
1104     size_t outLen = 0;
1105     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1106     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1107     {
1108         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1109         if ( !inBuff )
1110             return wxCONV_FAILED;
1111
1112         if ( ++outLen > dstLen )
1113             return wxCONV_FAILED;
1114
1115         *dst++ = ch;
1116     }
1117
1118
1119     return outLen;
1120 }
1121
1122 size_t
1123 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1124                                  const wchar_t *src, size_t srcLen) const
1125 {
1126     if ( srcLen == wxNO_LEN )
1127         srcLen = wxWcslen(src) + 1;
1128
1129     size_t outLen = 0;
1130     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1131     for ( size_t n = 0; n < srcLen; n++ )
1132     {
1133         wxUint16 cc[2];
1134         const size_t numChars = encode_utf16(*src++, cc);
1135         if ( numChars == wxCONV_FAILED )
1136             return wxCONV_FAILED;
1137
1138         outLen += numChars * BYTES_PER_CHAR;
1139         if ( outBuff )
1140         {
1141             if ( outLen > dstLen )
1142                 return wxCONV_FAILED;
1143
1144             *outBuff++ = cc[0];
1145             if ( numChars == 2 )
1146             {
1147                 // second character of a surrogate
1148                 *outBuff++ = cc[1];
1149             }
1150         }
1151     }
1152
1153     return outLen;
1154 }
1155
1156 // ----------------------------------------------------------------------------
1157 // endian-reversing conversions
1158 // ----------------------------------------------------------------------------
1159
1160 size_t
1161 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1162                            const char *src, size_t srcLen) const
1163 {
1164     srcLen = GetLength(src, srcLen);
1165     if ( srcLen == wxNO_LEN )
1166         return wxCONV_FAILED;
1167
1168     const size_t inLen = srcLen / BYTES_PER_CHAR;
1169     if ( !dst )
1170     {
1171         // optimization: return maximal space which could be needed for this
1172         // string even if the real size could be smaller if the buffer contains
1173         // any surrogates
1174         return inLen;
1175     }
1176
1177     size_t outLen = 0;
1178     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1179     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1180     {
1181         wxUint32 ch;
1182         wxUint16 tmp[2];
1183
1184         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1185         inBuff++;
1186         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1187
1188         const size_t numChars = decode_utf16(tmp, ch);
1189         if ( numChars == wxCONV_FAILED )
1190             return wxCONV_FAILED;
1191
1192         if ( numChars == 2 )
1193             inBuff++;
1194
1195         if ( ++outLen > dstLen )
1196             return wxCONV_FAILED;
1197
1198         *dst++ = ch;
1199     }
1200
1201
1202     return outLen;
1203 }
1204
1205 size_t
1206 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1207                              const wchar_t *src, size_t srcLen) const
1208 {
1209     if ( srcLen == wxNO_LEN )
1210         srcLen = wxWcslen(src) + 1;
1211
1212     size_t outLen = 0;
1213     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1214     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1215     {
1216         wxUint16 cc[2];
1217         const size_t numChars = encode_utf16(*src, cc);
1218         if ( numChars == wxCONV_FAILED )
1219             return wxCONV_FAILED;
1220
1221         outLen += numChars * BYTES_PER_CHAR;
1222         if ( outBuff )
1223         {
1224             if ( outLen > dstLen )
1225                 return wxCONV_FAILED;
1226
1227             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1228             if ( numChars == 2 )
1229             {
1230                 // second character of a surrogate
1231                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1232             }
1233         }
1234     }
1235
1236     return outLen;
1237 }
1238
1239 #endif // WC_UTF16/!WC_UTF16
1240
1241
1242 // ============================================================================
1243 // UTF-32
1244 // ============================================================================
1245
1246 #ifdef WORDS_BIGENDIAN
1247     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1248     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1249 #else
1250     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1251     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1252 #endif
1253
1254
1255 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1256 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1257
1258 /* static */
1259 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1260 {
1261     if ( srcLen == wxNO_LEN )
1262     {
1263         // count the number of bytes in input, including the trailing NULs
1264         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1265         for ( srcLen = 1; *inBuff++; srcLen++ )
1266             ;
1267
1268         srcLen *= BYTES_PER_CHAR;
1269     }
1270     else // we already have the length
1271     {
1272         // we can only convert an entire number of UTF-32 characters
1273         if ( srcLen % BYTES_PER_CHAR )
1274             return wxCONV_FAILED;
1275     }
1276
1277     return srcLen;
1278 }
1279
1280 // case when in-memory representation is UTF-16
1281 #ifdef WC_UTF16
1282
1283 // ----------------------------------------------------------------------------
1284 // conversions without endianness change
1285 // ----------------------------------------------------------------------------
1286
1287 size_t
1288 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1289                                const char *src, size_t srcLen) const
1290 {
1291     srcLen = GetLength(src, srcLen);
1292     if ( srcLen == wxNO_LEN )
1293         return wxCONV_FAILED;
1294
1295     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1296     const size_t inLen = srcLen / BYTES_PER_CHAR;
1297     size_t outLen = 0;
1298     for ( size_t n = 0; n < inLen; n++ )
1299     {
1300         wxUint16 cc[2];
1301         const size_t numChars = encode_utf16(*inBuff++, cc);
1302         if ( numChars == wxCONV_FAILED )
1303             return wxCONV_FAILED;
1304
1305         outLen += numChars;
1306         if ( dst )
1307         {
1308             if ( outLen > dstLen )
1309                 return wxCONV_FAILED;
1310
1311             *dst++ = cc[0];
1312             if ( numChars == 2 )
1313             {
1314                 // second character of a surrogate
1315                 *dst++ = cc[1];
1316             }
1317         }
1318     }
1319
1320     return outLen;
1321 }
1322
1323 size_t
1324 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1325                                  const wchar_t *src, size_t srcLen) const
1326 {
1327     if ( srcLen == wxNO_LEN )
1328         srcLen = wxWcslen(src) + 1;
1329
1330     if ( !dst )
1331     {
1332         // optimization: return maximal space which could be needed for this
1333         // string instead of the exact amount which could be less if there are
1334         // any surrogates in the input
1335         //
1336         // we consider that surrogates are rare enough to make it worthwhile to
1337         // avoid running the loop below at the cost of slightly extra memory
1338         // consumption
1339         return srcLen * BYTES_PER_CHAR;
1340     }
1341
1342     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1343     size_t outLen = 0;
1344     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1345     {
1346         const wxUint32 ch = wxDecodeSurrogate(&src);
1347         if ( !src )
1348             return wxCONV_FAILED;
1349
1350         outLen += BYTES_PER_CHAR;
1351
1352         if ( outLen > dstLen )
1353             return wxCONV_FAILED;
1354
1355         *outBuff++ = ch;
1356     }
1357
1358     return outLen;
1359 }
1360
1361 // ----------------------------------------------------------------------------
1362 // endian-reversing conversions
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1367                            const char *src, size_t srcLen) const
1368 {
1369     srcLen = GetLength(src, srcLen);
1370     if ( srcLen == wxNO_LEN )
1371         return wxCONV_FAILED;
1372
1373     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1374     const size_t inLen = srcLen / BYTES_PER_CHAR;
1375     size_t outLen = 0;
1376     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1377     {
1378         wxUint16 cc[2];
1379         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1380         if ( numChars == wxCONV_FAILED )
1381             return wxCONV_FAILED;
1382
1383         outLen += numChars;
1384         if ( dst )
1385         {
1386             if ( outLen > dstLen )
1387                 return wxCONV_FAILED;
1388
1389             *dst++ = cc[0];
1390             if ( numChars == 2 )
1391             {
1392                 // second character of a surrogate
1393                 *dst++ = cc[1];
1394             }
1395         }
1396     }
1397
1398     return outLen;
1399 }
1400
1401 size_t
1402 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1403                              const wchar_t *src, size_t srcLen) const
1404 {
1405     if ( srcLen == wxNO_LEN )
1406         srcLen = wxWcslen(src) + 1;
1407
1408     if ( !dst )
1409     {
1410         // optimization: return maximal space which could be needed for this
1411         // string instead of the exact amount which could be less if there are
1412         // any surrogates in the input
1413         //
1414         // we consider that surrogates are rare enough to make it worthwhile to
1415         // avoid running the loop below at the cost of slightly extra memory
1416         // consumption
1417         return srcLen*BYTES_PER_CHAR;
1418     }
1419
1420     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1421     size_t outLen = 0;
1422     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1423     {
1424         const wxUint32 ch = wxDecodeSurrogate(&src);
1425         if ( !src )
1426             return wxCONV_FAILED;
1427
1428         outLen += BYTES_PER_CHAR;
1429
1430         if ( outLen > dstLen )
1431             return wxCONV_FAILED;
1432
1433         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1434     }
1435
1436     return outLen;
1437 }
1438
1439 #else // !WC_UTF16: wchar_t is UTF-32
1440
1441 // ----------------------------------------------------------------------------
1442 // conversions without endianness change
1443 // ----------------------------------------------------------------------------
1444
1445 size_t
1446 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1447                                const char *src, size_t srcLen) const
1448 {
1449     // use memcpy() as it should be much faster than hand-written loop
1450     srcLen = GetLength(src, srcLen);
1451     if ( srcLen == wxNO_LEN )
1452         return wxCONV_FAILED;
1453
1454     const size_t inLen = srcLen/BYTES_PER_CHAR;
1455     if ( dst )
1456     {
1457         if ( dstLen < inLen )
1458             return wxCONV_FAILED;
1459
1460         memcpy(dst, src, srcLen);
1461     }
1462
1463     return inLen;
1464 }
1465
1466 size_t
1467 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1468                                  const wchar_t *src, size_t srcLen) const
1469 {
1470     if ( srcLen == wxNO_LEN )
1471         srcLen = wxWcslen(src) + 1;
1472
1473     srcLen *= BYTES_PER_CHAR;
1474
1475     if ( dst )
1476     {
1477         if ( dstLen < srcLen )
1478             return wxCONV_FAILED;
1479
1480         memcpy(dst, src, srcLen);
1481     }
1482
1483     return srcLen;
1484 }
1485
1486 // ----------------------------------------------------------------------------
1487 // endian-reversing conversions
1488 // ----------------------------------------------------------------------------
1489
1490 size_t
1491 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1492                            const char *src, size_t srcLen) const
1493 {
1494     srcLen = GetLength(src, srcLen);
1495     if ( srcLen == wxNO_LEN )
1496         return wxCONV_FAILED;
1497
1498     srcLen /= BYTES_PER_CHAR;
1499
1500     if ( dst )
1501     {
1502         if ( dstLen < srcLen )
1503             return wxCONV_FAILED;
1504
1505         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1506         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1507         {
1508             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1509         }
1510     }
1511
1512     return srcLen;
1513 }
1514
1515 size_t
1516 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1517                              const wchar_t *src, size_t srcLen) const
1518 {
1519     if ( srcLen == wxNO_LEN )
1520         srcLen = wxWcslen(src) + 1;
1521
1522     srcLen *= BYTES_PER_CHAR;
1523
1524     if ( dst )
1525     {
1526         if ( dstLen < srcLen )
1527             return wxCONV_FAILED;
1528
1529         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1530         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1531         {
1532             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1533         }
1534     }
1535
1536     return srcLen;
1537 }
1538
1539 #endif // WC_UTF16/!WC_UTF16
1540
1541
1542 // ============================================================================
1543 // The classes doing conversion using the iconv_xxx() functions
1544 // ============================================================================
1545
1546 #ifdef HAVE_ICONV
1547
1548 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1549 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1550 //     (unless there's yet another bug in glibc) the only case when iconv()
1551 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1552 //     left in the input buffer -- when _real_ error occurs,
1553 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1554 //     iconv() failure.
1555 //     [This bug does not appear in glibc 2.2.]
1556 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1557 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1558                                      (errno != E2BIG || bufLeft != 0))
1559 #else
1560 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1561 #endif
1562
1563 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1564
1565 #define ICONV_T_INVALID ((iconv_t)-1)
1566
1567 #if SIZEOF_WCHAR_T == 4
1568     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1569     #define WC_ENC      wxFONTENCODING_UTF32
1570 #elif SIZEOF_WCHAR_T == 2
1571     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1572     #define WC_ENC      wxFONTENCODING_UTF16
1573 #else // sizeof(wchar_t) != 2 nor 4
1574     // does this ever happen?
1575     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1576 #endif
1577
1578 // ----------------------------------------------------------------------------
1579 // wxMBConv_iconv: encapsulates an iconv character set
1580 // ----------------------------------------------------------------------------
1581
1582 class wxMBConv_iconv : public wxMBConv
1583 {
1584 public:
1585     wxMBConv_iconv(const wxChar *name);
1586     virtual ~wxMBConv_iconv();
1587
1588     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1589     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1590
1591     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1592     virtual size_t GetMBNulLen() const;
1593
1594     virtual wxMBConv *Clone() const
1595     {
1596         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1597         p->m_minMBCharWidth = m_minMBCharWidth;
1598         return p;
1599     }
1600
1601     bool IsOk() const
1602         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1603
1604 protected:
1605     // the iconv handlers used to translate from multibyte
1606     // to wide char and in the other direction
1607     iconv_t m2w,
1608             w2m;
1609
1610 #if wxUSE_THREADS
1611     // guards access to m2w and w2m objects
1612     wxMutex m_iconvMutex;
1613 #endif
1614
1615 private:
1616     // the name (for iconv_open()) of a wide char charset -- if none is
1617     // available on this machine, it will remain NULL
1618     static wxString ms_wcCharsetName;
1619
1620     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1621     // different endian-ness than the native one
1622     static bool ms_wcNeedsSwap;
1623
1624
1625     // name of the encoding handled by this conversion
1626     wxString m_name;
1627
1628     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1629     // initially
1630     size_t m_minMBCharWidth;
1631 };
1632
1633 // make the constructor available for unit testing
1634 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1635 {
1636     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1637     if ( !result->IsOk() )
1638     {
1639         delete result;
1640         return 0;
1641     }
1642
1643     return result;
1644 }
1645
1646 wxString wxMBConv_iconv::ms_wcCharsetName;
1647 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1648
1649 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1650               : m_name(name)
1651 {
1652     m_minMBCharWidth = 0;
1653
1654     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1655     // names for the charsets
1656     const wxCharBuffer cname(wxString(name).ToAscii());
1657
1658     // check for charset that represents wchar_t:
1659     if ( ms_wcCharsetName.empty() )
1660     {
1661         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1662
1663 #if wxUSE_FONTMAP
1664         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1665 #else // !wxUSE_FONTMAP
1666         static const wxChar *names[] =
1667         {
1668 #if SIZEOF_WCHAR_T == 4
1669             _T("UCS-4"),
1670 #elif SIZEOF_WCHAR_T = 2
1671             _T("UCS-2"),
1672 #endif
1673             NULL
1674         };
1675 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1676
1677         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1678         {
1679             const wxString nameCS(*names);
1680
1681             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1682             wxString nameXE(nameCS);
1683
1684 #ifdef WORDS_BIGENDIAN
1685                 nameXE += _T("BE");
1686 #else // little endian
1687                 nameXE += _T("LE");
1688 #endif
1689
1690             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1691                        nameXE.c_str());
1692
1693             m2w = iconv_open(nameXE.ToAscii(), cname);
1694             if ( m2w == ICONV_T_INVALID )
1695             {
1696                 // try charset w/o bytesex info (e.g. "UCS4")
1697                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1698                            nameCS.c_str());
1699                 m2w = iconv_open(nameCS.ToAscii(), cname);
1700
1701                 // and check for bytesex ourselves:
1702                 if ( m2w != ICONV_T_INVALID )
1703                 {
1704                     char    buf[2], *bufPtr;
1705                     wchar_t wbuf[2], *wbufPtr;
1706                     size_t  insz, outsz;
1707                     size_t  res;
1708
1709                     buf[0] = 'A';
1710                     buf[1] = 0;
1711                     wbuf[0] = 0;
1712                     insz = 2;
1713                     outsz = SIZEOF_WCHAR_T * 2;
1714                     wbufPtr = wbuf;
1715                     bufPtr = buf;
1716
1717                     res = iconv(
1718                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1719                         (char**)&wbufPtr, &outsz);
1720
1721                     if (ICONV_FAILED(res, insz))
1722                     {
1723                         wxLogLastError(wxT("iconv"));
1724                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1725                                    nameCS.c_str());
1726                     }
1727                     else // ok, can convert to this encoding, remember it
1728                     {
1729                         ms_wcCharsetName = nameCS;
1730                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1731                     }
1732                 }
1733             }
1734             else // use charset not requiring byte swapping
1735             {
1736                 ms_wcCharsetName = nameXE;
1737             }
1738         }
1739
1740         wxLogTrace(TRACE_STRCONV,
1741                    wxT("iconv wchar_t charset is \"%s\"%s"),
1742                    ms_wcCharsetName.empty() ? _T("<none>")
1743                                             : ms_wcCharsetName.c_str(),
1744                    ms_wcNeedsSwap ? _T(" (needs swap)")
1745                                   : _T(""));
1746     }
1747     else // we already have ms_wcCharsetName
1748     {
1749         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1750     }
1751
1752     if ( ms_wcCharsetName.empty() )
1753     {
1754         w2m = ICONV_T_INVALID;
1755     }
1756     else
1757     {
1758         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1759         if ( w2m == ICONV_T_INVALID )
1760         {
1761             wxLogTrace(TRACE_STRCONV,
1762                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1763                        ms_wcCharsetName.c_str(), cname.data());
1764         }
1765     }
1766 }
1767
1768 wxMBConv_iconv::~wxMBConv_iconv()
1769 {
1770     if ( m2w != ICONV_T_INVALID )
1771         iconv_close(m2w);
1772     if ( w2m != ICONV_T_INVALID )
1773         iconv_close(w2m);
1774 }
1775
1776 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1777 {
1778     // find the string length: notice that must be done differently for
1779     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1780     size_t inbuf;
1781     const size_t nulLen = GetMBNulLen();
1782     switch ( nulLen )
1783     {
1784         default:
1785             return wxCONV_FAILED;
1786
1787         case 1:
1788             inbuf = strlen(psz); // arguably more optimized than our version
1789             break;
1790
1791         case 2:
1792         case 4:
1793             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1794             // they also have to start at character boundary and not span two
1795             // adjacent characters
1796             const char *p;
1797             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1798                 ;
1799             inbuf = p - psz;
1800             break;
1801     }
1802
1803 #if wxUSE_THREADS
1804     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1805     //     Unfortunately there is a couple of global wxCSConv objects such as
1806     //     wxConvLocal that are used all over wx code, so we have to make sure
1807     //     the handle is used by at most one thread at the time. Otherwise
1808     //     only a few wx classes would be safe to use from non-main threads
1809     //     as MB<->WC conversion would fail "randomly".
1810     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1811 #endif // wxUSE_THREADS
1812
1813     size_t outbuf = n * SIZEOF_WCHAR_T;
1814     size_t res, cres;
1815     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1816     wchar_t *bufPtr = buf;
1817     const char *pszPtr = psz;
1818
1819     if (buf)
1820     {
1821         // have destination buffer, convert there
1822         cres = iconv(m2w,
1823                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1824                      (char**)&bufPtr, &outbuf);
1825         res = n - (outbuf / SIZEOF_WCHAR_T);
1826
1827         if (ms_wcNeedsSwap)
1828         {
1829             // convert to native endianness
1830             for ( unsigned i = 0; i < res; i++ )
1831                 buf[n] = WC_BSWAP(buf[i]);
1832         }
1833
1834         // NUL-terminate the string if there is any space left
1835         if (res < n)
1836             buf[res] = 0;
1837     }
1838     else
1839     {
1840         // no destination buffer... convert using temp buffer
1841         // to calculate destination buffer requirement
1842         wchar_t tbuf[8];
1843         res = 0;
1844
1845         do
1846         {
1847             bufPtr = tbuf;
1848             outbuf = 8 * SIZEOF_WCHAR_T;
1849
1850             cres = iconv(m2w,
1851                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1852                          (char**)&bufPtr, &outbuf );
1853
1854             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1855         }
1856         while ((cres == (size_t)-1) && (errno == E2BIG));
1857     }
1858
1859     if (ICONV_FAILED(cres, inbuf))
1860     {
1861         //VS: it is ok if iconv fails, hence trace only
1862         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1863         return wxCONV_FAILED;
1864     }
1865
1866     return res;
1867 }
1868
1869 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1870 {
1871 #if wxUSE_THREADS
1872     // NB: explained in MB2WC
1873     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1874 #endif
1875
1876     size_t inlen = wxWcslen(psz);
1877     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1878     size_t outbuf = n;
1879     size_t res, cres;
1880
1881     wchar_t *tmpbuf = 0;
1882
1883     if (ms_wcNeedsSwap)
1884     {
1885         // need to copy to temp buffer to switch endianness
1886         // (doing WC_BSWAP twice on the original buffer won't help, as it
1887         //  could be in read-only memory, or be accessed in some other thread)
1888         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1889         for ( size_t i = 0; i < inlen; i++ )
1890             tmpbuf[n] = WC_BSWAP(psz[i]);
1891
1892         tmpbuf[inlen] = L'\0';
1893         psz = tmpbuf;
1894     }
1895
1896     if (buf)
1897     {
1898         // have destination buffer, convert there
1899         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1900
1901         res = n - outbuf;
1902
1903         // NB: iconv was given only wcslen(psz) characters on input, and so
1904         //     it couldn't convert the trailing zero. Let's do it ourselves
1905         //     if there's some room left for it in the output buffer.
1906         if (res < n)
1907             buf[0] = 0;
1908     }
1909     else
1910     {
1911         // no destination buffer: convert using temp buffer
1912         // to calculate destination buffer requirement
1913         char tbuf[16];
1914         res = 0;
1915         do
1916         {
1917             buf = tbuf;
1918             outbuf = 16;
1919
1920             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1921
1922             res += 16 - outbuf;
1923         }
1924         while ((cres == (size_t)-1) && (errno == E2BIG));
1925     }
1926
1927     if (ms_wcNeedsSwap)
1928     {
1929         free(tmpbuf);
1930     }
1931
1932     if (ICONV_FAILED(cres, inbuf))
1933     {
1934         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1935         return wxCONV_FAILED;
1936     }
1937
1938     return res;
1939 }
1940
1941 size_t wxMBConv_iconv::GetMBNulLen() const
1942 {
1943     if ( m_minMBCharWidth == 0 )
1944     {
1945         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1946
1947 #if wxUSE_THREADS
1948         // NB: explained in MB2WC
1949         wxMutexLocker lock(self->m_iconvMutex);
1950 #endif
1951
1952         wchar_t *wnul = L"";
1953         char buf[8]; // should be enough for NUL in any encoding
1954         size_t inLen = sizeof(wchar_t),
1955                outLen = WXSIZEOF(buf);
1956         char *inBuff = (char *)wnul;
1957         char *outBuff = buf;
1958         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1959         {
1960             self->m_minMBCharWidth = (size_t)-1;
1961         }
1962         else // ok
1963         {
1964             self->m_minMBCharWidth = outBuff - buf;
1965         }
1966     }
1967
1968     return m_minMBCharWidth;
1969 }
1970
1971 #endif // HAVE_ICONV
1972
1973
1974 // ============================================================================
1975 // Win32 conversion classes
1976 // ============================================================================
1977
1978 #ifdef wxHAVE_WIN32_MB2WC
1979
1980 // from utils.cpp
1981 #if wxUSE_FONTMAP
1982 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1983 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1984 #endif
1985
1986 class wxMBConv_win32 : public wxMBConv
1987 {
1988 public:
1989     wxMBConv_win32()
1990     {
1991         m_CodePage = CP_ACP;
1992         m_minMBCharWidth = 0;
1993     }
1994
1995     wxMBConv_win32(const wxMBConv_win32& conv)
1996     {
1997         m_CodePage = conv.m_CodePage;
1998         m_minMBCharWidth = conv.m_minMBCharWidth;
1999     }
2000
2001 #if wxUSE_FONTMAP
2002     wxMBConv_win32(const wxChar* name)
2003     {
2004         m_CodePage = wxCharsetToCodepage(name);
2005         m_minMBCharWidth = 0;
2006     }
2007
2008     wxMBConv_win32(wxFontEncoding encoding)
2009     {
2010         m_CodePage = wxEncodingToCodepage(encoding);
2011         m_minMBCharWidth = 0;
2012     }
2013 #endif // wxUSE_FONTMAP
2014
2015     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2016     {
2017         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2018         // the behaviour is not compatible with the Unix version (using iconv)
2019         // and break the library itself, e.g. wxTextInputStream::NextChar()
2020         // wouldn't work if reading an incomplete MB char didn't result in an
2021         // error
2022         //
2023         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2024         // Win XP or newer and it is not supported for UTF-[78] so we always
2025         // use our own conversions in this case. See
2026         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2027         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2028         if ( m_CodePage == CP_UTF8 )
2029         {
2030             return wxConvUTF8.MB2WC(buf, psz, n);
2031         }
2032
2033         if ( m_CodePage == CP_UTF7 )
2034         {
2035             return wxConvUTF7.MB2WC(buf, psz, n);
2036         }
2037
2038         int flags = 0;
2039         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2040                 IsAtLeastWin2kSP4() )
2041         {
2042             flags = MB_ERR_INVALID_CHARS;
2043         }
2044
2045         const size_t len = ::MultiByteToWideChar
2046                              (
2047                                 m_CodePage,     // code page
2048                                 flags,          // flags: fall on error
2049                                 psz,            // input string
2050                                 -1,             // its length (NUL-terminated)
2051                                 buf,            // output string
2052                                 buf ? n : 0     // size of output buffer
2053                              );
2054         if ( !len )
2055         {
2056             // function totally failed
2057             return wxCONV_FAILED;
2058         }
2059
2060         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2061         // check if we succeeded, by doing a double trip:
2062         if ( !flags && buf )
2063         {
2064             const size_t mbLen = strlen(psz);
2065             wxCharBuffer mbBuf(mbLen);
2066             if ( ::WideCharToMultiByte
2067                    (
2068                       m_CodePage,
2069                       0,
2070                       buf,
2071                       -1,
2072                       mbBuf.data(),
2073                       mbLen + 1,        // size in bytes, not length
2074                       NULL,
2075                       NULL
2076                    ) == 0 ||
2077                   strcmp(mbBuf, psz) != 0 )
2078             {
2079                 // we didn't obtain the same thing we started from, hence
2080                 // the conversion was lossy and we consider that it failed
2081                 return wxCONV_FAILED;
2082             }
2083         }
2084
2085         // note that it returns count of written chars for buf != NULL and size
2086         // of the needed buffer for buf == NULL so in either case the length of
2087         // the string (which never includes the terminating NUL) is one less
2088         return len - 1;
2089     }
2090
2091     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2092     {
2093         /*
2094             we have a problem here: by default, WideCharToMultiByte() may
2095             replace characters unrepresentable in the target code page with bad
2096             quality approximations such as turning "1/2" symbol (U+00BD) into
2097             "1" for the code pages which don't have it and we, obviously, want
2098             to avoid this at any price
2099
2100             the trouble is that this function does it _silently_, i.e. it won't
2101             even tell us whether it did or not... Win98/2000 and higher provide
2102             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2103             we have to resort to a round trip, i.e. check that converting back
2104             results in the same string -- this is, of course, expensive but
2105             otherwise we simply can't be sure to not garble the data.
2106          */
2107
2108         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2109         // it doesn't work with CJK encodings (which we test for rather roughly
2110         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2111         // supporting it
2112         BOOL usedDef wxDUMMY_INITIALIZE(false);
2113         BOOL *pUsedDef;
2114         int flags;
2115         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2116         {
2117             // it's our lucky day
2118             flags = WC_NO_BEST_FIT_CHARS;
2119             pUsedDef = &usedDef;
2120         }
2121         else // old system or unsupported encoding
2122         {
2123             flags = 0;
2124             pUsedDef = NULL;
2125         }
2126
2127         const size_t len = ::WideCharToMultiByte
2128                              (
2129                                 m_CodePage,     // code page
2130                                 flags,          // either none or no best fit
2131                                 pwz,            // input string
2132                                 -1,             // it is (wide) NUL-terminated
2133                                 buf,            // output buffer
2134                                 buf ? n : 0,    // and its size
2135                                 NULL,           // default "replacement" char
2136                                 pUsedDef        // [out] was it used?
2137                              );
2138
2139         if ( !len )
2140         {
2141             // function totally failed
2142             return wxCONV_FAILED;
2143         }
2144
2145         // if we were really converting, check if we succeeded
2146         if ( buf )
2147         {
2148             if ( flags )
2149             {
2150                 // check if the conversion failed, i.e. if any replacements
2151                 // were done
2152                 if ( usedDef )
2153                     return wxCONV_FAILED;
2154             }
2155             else // we must resort to double tripping...
2156             {
2157                 wxWCharBuffer wcBuf(n);
2158                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2159                         wcscmp(wcBuf, pwz) != 0 )
2160                 {
2161                     // we didn't obtain the same thing we started from, hence
2162                     // the conversion was lossy and we consider that it failed
2163                     return wxCONV_FAILED;
2164                 }
2165             }
2166         }
2167
2168         // see the comment above for the reason of "len - 1"
2169         return len - 1;
2170     }
2171
2172     virtual size_t GetMBNulLen() const
2173     {
2174         if ( m_minMBCharWidth == 0 )
2175         {
2176             int len = ::WideCharToMultiByte
2177                         (
2178                             m_CodePage,     // code page
2179                             0,              // no flags
2180                             L"",            // input string
2181                             1,              // translate just the NUL
2182                             NULL,           // output buffer
2183                             0,              // and its size
2184                             NULL,           // no replacement char
2185                             NULL            // [out] don't care if it was used
2186                         );
2187
2188             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2189             switch ( len )
2190             {
2191                 default:
2192                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2193                     self->m_minMBCharWidth = (size_t)-1;
2194                     break;
2195
2196                 case 0:
2197                     self->m_minMBCharWidth = (size_t)-1;
2198                     break;
2199
2200                 case 1:
2201                 case 2:
2202                 case 4:
2203                     self->m_minMBCharWidth = len;
2204                     break;
2205             }
2206         }
2207
2208         return m_minMBCharWidth;
2209     }
2210
2211     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2212
2213     bool IsOk() const { return m_CodePage != -1; }
2214
2215 private:
2216     static bool CanUseNoBestFit()
2217     {
2218         static int s_isWin98Or2k = -1;
2219
2220         if ( s_isWin98Or2k == -1 )
2221         {
2222             int verMaj, verMin;
2223             switch ( wxGetOsVersion(&verMaj, &verMin) )
2224             {
2225                 case wxWIN95:
2226                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2227                     break;
2228
2229                 case wxWINDOWS_NT:
2230                     s_isWin98Or2k = verMaj >= 5;
2231                     break;
2232
2233                 default:
2234                     // unknown: be conservative by default
2235                     s_isWin98Or2k = 0;
2236                     break;
2237             }
2238
2239             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2240         }
2241
2242         return s_isWin98Or2k == 1;
2243     }
2244
2245     static bool IsAtLeastWin2kSP4()
2246     {
2247 #ifdef __WXWINCE__
2248         return false;
2249 #else
2250         static int s_isAtLeastWin2kSP4 = -1;
2251
2252         if ( s_isAtLeastWin2kSP4 == -1 )
2253         {
2254             OSVERSIONINFOEX ver;
2255
2256             memset(&ver, 0, sizeof(ver));
2257             ver.dwOSVersionInfoSize = sizeof(ver);
2258             GetVersionEx((OSVERSIONINFO*)&ver);
2259
2260             s_isAtLeastWin2kSP4 =
2261               ((ver.dwMajorVersion > 5) || // Vista+
2262                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2263                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2264                ver.wServicePackMajor >= 4)) // 2000 SP4+
2265               ? 1 : 0;
2266         }
2267
2268         return s_isAtLeastWin2kSP4 == 1;
2269 #endif
2270     }
2271
2272
2273     // the code page we're working with
2274     long m_CodePage;
2275
2276     // cached result of GetMBNulLen(), set to 0 initially meaning
2277     // "unknown"
2278     size_t m_minMBCharWidth;
2279 };
2280
2281 #endif // wxHAVE_WIN32_MB2WC
2282
2283 // ============================================================================
2284 // Cocoa conversion classes
2285 // ============================================================================
2286
2287 #if defined(__WXCOCOA__)
2288
2289 // RN: There is no UTF-32 support in either Core Foundation or Cocoa.
2290 // Strangely enough, Core Foundation uses UTF-32 internally
2291 // quite a bit - it's just not public (yet).
2292
2293 #include <CoreFoundation/CFString.h>
2294 #include <CoreFoundation/CFStringEncodingExt.h>
2295
2296 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2297 {
2298     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2299
2300     switch (encoding)
2301     {
2302         case wxFONTENCODING_DEFAULT :
2303             enc = CFStringGetSystemEncoding();
2304             break ;
2305
2306         case wxFONTENCODING_ISO8859_1 :
2307             enc = kCFStringEncodingISOLatin1 ;
2308             break ;
2309         case wxFONTENCODING_ISO8859_2 :
2310             enc = kCFStringEncodingISOLatin2;
2311             break ;
2312         case wxFONTENCODING_ISO8859_3 :
2313             enc = kCFStringEncodingISOLatin3 ;
2314             break ;
2315         case wxFONTENCODING_ISO8859_4 :
2316             enc = kCFStringEncodingISOLatin4;
2317             break ;
2318         case wxFONTENCODING_ISO8859_5 :
2319             enc = kCFStringEncodingISOLatinCyrillic;
2320             break ;
2321         case wxFONTENCODING_ISO8859_6 :
2322             enc = kCFStringEncodingISOLatinArabic;
2323             break ;
2324         case wxFONTENCODING_ISO8859_7 :
2325             enc = kCFStringEncodingISOLatinGreek;
2326             break ;
2327         case wxFONTENCODING_ISO8859_8 :
2328             enc = kCFStringEncodingISOLatinHebrew;
2329             break ;
2330         case wxFONTENCODING_ISO8859_9 :
2331             enc = kCFStringEncodingISOLatin5;
2332             break ;
2333         case wxFONTENCODING_ISO8859_10 :
2334             enc = kCFStringEncodingISOLatin6;
2335             break ;
2336         case wxFONTENCODING_ISO8859_11 :
2337             enc = kCFStringEncodingISOLatinThai;
2338             break ;
2339         case wxFONTENCODING_ISO8859_13 :
2340             enc = kCFStringEncodingISOLatin7;
2341             break ;
2342         case wxFONTENCODING_ISO8859_14 :
2343             enc = kCFStringEncodingISOLatin8;
2344             break ;
2345         case wxFONTENCODING_ISO8859_15 :
2346             enc = kCFStringEncodingISOLatin9;
2347             break ;
2348
2349         case wxFONTENCODING_KOI8 :
2350             enc = kCFStringEncodingKOI8_R;
2351             break ;
2352         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2353             enc = kCFStringEncodingDOSRussian;
2354             break ;
2355
2356 //      case wxFONTENCODING_BULGARIAN :
2357 //          enc = ;
2358 //          break ;
2359
2360         case wxFONTENCODING_CP437 :
2361             enc = kCFStringEncodingDOSLatinUS ;
2362             break ;
2363         case wxFONTENCODING_CP850 :
2364             enc = kCFStringEncodingDOSLatin1;
2365             break ;
2366         case wxFONTENCODING_CP852 :
2367             enc = kCFStringEncodingDOSLatin2;
2368             break ;
2369         case wxFONTENCODING_CP855 :
2370             enc = kCFStringEncodingDOSCyrillic;
2371             break ;
2372         case wxFONTENCODING_CP866 :
2373             enc = kCFStringEncodingDOSRussian ;
2374             break ;
2375         case wxFONTENCODING_CP874 :
2376             enc = kCFStringEncodingDOSThai;
2377             break ;
2378         case wxFONTENCODING_CP932 :
2379             enc = kCFStringEncodingDOSJapanese;
2380             break ;
2381         case wxFONTENCODING_CP936 :
2382             enc = kCFStringEncodingDOSChineseSimplif ;
2383             break ;
2384         case wxFONTENCODING_CP949 :
2385             enc = kCFStringEncodingDOSKorean;
2386             break ;
2387         case wxFONTENCODING_CP950 :
2388             enc = kCFStringEncodingDOSChineseTrad;
2389             break ;
2390         case wxFONTENCODING_CP1250 :
2391             enc = kCFStringEncodingWindowsLatin2;
2392             break ;
2393         case wxFONTENCODING_CP1251 :
2394             enc = kCFStringEncodingWindowsCyrillic ;
2395             break ;
2396         case wxFONTENCODING_CP1252 :
2397             enc = kCFStringEncodingWindowsLatin1 ;
2398             break ;
2399         case wxFONTENCODING_CP1253 :
2400             enc = kCFStringEncodingWindowsGreek;
2401             break ;
2402         case wxFONTENCODING_CP1254 :
2403             enc = kCFStringEncodingWindowsLatin5;
2404             break ;
2405         case wxFONTENCODING_CP1255 :
2406             enc = kCFStringEncodingWindowsHebrew ;
2407             break ;
2408         case wxFONTENCODING_CP1256 :
2409             enc = kCFStringEncodingWindowsArabic ;
2410             break ;
2411         case wxFONTENCODING_CP1257 :
2412             enc = kCFStringEncodingWindowsBalticRim;
2413             break ;
2414 //   This only really encodes to UTF7 (if that) evidently
2415 //        case wxFONTENCODING_UTF7 :
2416 //            enc = kCFStringEncodingNonLossyASCII ;
2417 //            break ;
2418         case wxFONTENCODING_UTF8 :
2419             enc = kCFStringEncodingUTF8 ;
2420             break ;
2421         case wxFONTENCODING_EUC_JP :
2422             enc = kCFStringEncodingEUC_JP;
2423             break ;
2424         case wxFONTENCODING_UTF16 :
2425             enc = kCFStringEncodingUnicode ;
2426             break ;
2427         case wxFONTENCODING_MACROMAN :
2428             enc = kCFStringEncodingMacRoman ;
2429             break ;
2430         case wxFONTENCODING_MACJAPANESE :
2431             enc = kCFStringEncodingMacJapanese ;
2432             break ;
2433         case wxFONTENCODING_MACCHINESETRAD :
2434             enc = kCFStringEncodingMacChineseTrad ;
2435             break ;
2436         case wxFONTENCODING_MACKOREAN :
2437             enc = kCFStringEncodingMacKorean ;
2438             break ;
2439         case wxFONTENCODING_MACARABIC :
2440             enc = kCFStringEncodingMacArabic ;
2441             break ;
2442         case wxFONTENCODING_MACHEBREW :
2443             enc = kCFStringEncodingMacHebrew ;
2444             break ;
2445         case wxFONTENCODING_MACGREEK :
2446             enc = kCFStringEncodingMacGreek ;
2447             break ;
2448         case wxFONTENCODING_MACCYRILLIC :
2449             enc = kCFStringEncodingMacCyrillic ;
2450             break ;
2451         case wxFONTENCODING_MACDEVANAGARI :
2452             enc = kCFStringEncodingMacDevanagari ;
2453             break ;
2454         case wxFONTENCODING_MACGURMUKHI :
2455             enc = kCFStringEncodingMacGurmukhi ;
2456             break ;
2457         case wxFONTENCODING_MACGUJARATI :
2458             enc = kCFStringEncodingMacGujarati ;
2459             break ;
2460         case wxFONTENCODING_MACORIYA :
2461             enc = kCFStringEncodingMacOriya ;
2462             break ;
2463         case wxFONTENCODING_MACBENGALI :
2464             enc = kCFStringEncodingMacBengali ;
2465             break ;
2466         case wxFONTENCODING_MACTAMIL :
2467             enc = kCFStringEncodingMacTamil ;
2468             break ;
2469         case wxFONTENCODING_MACTELUGU :
2470             enc = kCFStringEncodingMacTelugu ;
2471             break ;
2472         case wxFONTENCODING_MACKANNADA :
2473             enc = kCFStringEncodingMacKannada ;
2474             break ;
2475         case wxFONTENCODING_MACMALAJALAM :
2476             enc = kCFStringEncodingMacMalayalam ;
2477             break ;
2478         case wxFONTENCODING_MACSINHALESE :
2479             enc = kCFStringEncodingMacSinhalese ;
2480             break ;
2481         case wxFONTENCODING_MACBURMESE :
2482             enc = kCFStringEncodingMacBurmese ;
2483             break ;
2484         case wxFONTENCODING_MACKHMER :
2485             enc = kCFStringEncodingMacKhmer ;
2486             break ;
2487         case wxFONTENCODING_MACTHAI :
2488             enc = kCFStringEncodingMacThai ;
2489             break ;
2490         case wxFONTENCODING_MACLAOTIAN :
2491             enc = kCFStringEncodingMacLaotian ;
2492             break ;
2493         case wxFONTENCODING_MACGEORGIAN :
2494             enc = kCFStringEncodingMacGeorgian ;
2495             break ;
2496         case wxFONTENCODING_MACARMENIAN :
2497             enc = kCFStringEncodingMacArmenian ;
2498             break ;
2499         case wxFONTENCODING_MACCHINESESIMP :
2500             enc = kCFStringEncodingMacChineseSimp ;
2501             break ;
2502         case wxFONTENCODING_MACTIBETAN :
2503             enc = kCFStringEncodingMacTibetan ;
2504             break ;
2505         case wxFONTENCODING_MACMONGOLIAN :
2506             enc = kCFStringEncodingMacMongolian ;
2507             break ;
2508         case wxFONTENCODING_MACETHIOPIC :
2509             enc = kCFStringEncodingMacEthiopic ;
2510             break ;
2511         case wxFONTENCODING_MACCENTRALEUR :
2512             enc = kCFStringEncodingMacCentralEurRoman ;
2513             break ;
2514         case wxFONTENCODING_MACVIATNAMESE :
2515             enc = kCFStringEncodingMacVietnamese ;
2516             break ;
2517         case wxFONTENCODING_MACARABICEXT :
2518             enc = kCFStringEncodingMacExtArabic ;
2519             break ;
2520         case wxFONTENCODING_MACSYMBOL :
2521             enc = kCFStringEncodingMacSymbol ;
2522             break ;
2523         case wxFONTENCODING_MACDINGBATS :
2524             enc = kCFStringEncodingMacDingbats ;
2525             break ;
2526         case wxFONTENCODING_MACTURKISH :
2527             enc = kCFStringEncodingMacTurkish ;
2528             break ;
2529         case wxFONTENCODING_MACCROATIAN :
2530             enc = kCFStringEncodingMacCroatian ;
2531             break ;
2532         case wxFONTENCODING_MACICELANDIC :
2533             enc = kCFStringEncodingMacIcelandic ;
2534             break ;
2535         case wxFONTENCODING_MACROMANIAN :
2536             enc = kCFStringEncodingMacRomanian ;
2537             break ;
2538         case wxFONTENCODING_MACCELTIC :
2539             enc = kCFStringEncodingMacCeltic ;
2540             break ;
2541         case wxFONTENCODING_MACGAELIC :
2542             enc = kCFStringEncodingMacGaelic ;
2543             break ;
2544 //      case wxFONTENCODING_MACKEYBOARD :
2545 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2546 //          break ;
2547
2548         default :
2549             // because gcc is picky
2550             break ;
2551     }
2552
2553     return enc ;
2554 }
2555
2556 class wxMBConv_cocoa : public wxMBConv
2557 {
2558 public:
2559     wxMBConv_cocoa()
2560     {
2561         Init(CFStringGetSystemEncoding()) ;
2562     }
2563
2564     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2565     {
2566         m_encoding = conv.m_encoding;
2567     }
2568
2569 #if wxUSE_FONTMAP
2570     wxMBConv_cocoa(const wxChar* name)
2571     {
2572         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2573     }
2574 #endif
2575
2576     wxMBConv_cocoa(wxFontEncoding encoding)
2577     {
2578         Init( wxCFStringEncFromFontEnc(encoding) );
2579     }
2580
2581     ~wxMBConv_cocoa()
2582     {
2583     }
2584
2585     void Init( CFStringEncoding encoding)
2586     {
2587         m_encoding = encoding ;
2588     }
2589
2590     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2591     {
2592         wxASSERT(szUnConv);
2593
2594         CFStringRef theString = CFStringCreateWithBytes (
2595                                                 NULL, //the allocator
2596                                                 (const UInt8*)szUnConv,
2597                                                 strlen(szUnConv),
2598                                                 m_encoding,
2599                                                 false //no BOM/external representation
2600                                                 );
2601
2602         wxASSERT(theString);
2603
2604         size_t nOutLength = CFStringGetLength(theString);
2605
2606         if (szOut == NULL)
2607         {
2608             CFRelease(theString);
2609             return nOutLength;
2610         }
2611
2612         CFRange theRange = { 0, nOutSize };
2613
2614 #if SIZEOF_WCHAR_T == 4
2615         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2616 #endif
2617
2618         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2619
2620         CFRelease(theString);
2621
2622         szUniCharBuffer[nOutLength] = '\0';
2623
2624 #if SIZEOF_WCHAR_T == 4
2625         wxMBConvUTF16 converter;
2626         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2627         delete [] szUniCharBuffer;
2628 #endif
2629
2630         return nOutLength;
2631     }
2632
2633     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2634     {
2635         wxASSERT(szUnConv);
2636
2637         size_t nRealOutSize;
2638         size_t nBufSize = wxWcslen(szUnConv);
2639         UniChar* szUniBuffer = (UniChar*) szUnConv;
2640
2641 #if SIZEOF_WCHAR_T == 4
2642         wxMBConvUTF16 converter ;
2643         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2644         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2645         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2646         nBufSize /= sizeof(UniChar);
2647 #endif
2648
2649         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2650                                 NULL, //allocator
2651                                 szUniBuffer,
2652                                 nBufSize,
2653                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2654                             );
2655
2656         wxASSERT(theString);
2657
2658         //Note that CER puts a BOM when converting to unicode
2659         //so we  check and use getchars instead in that case
2660         if (m_encoding == kCFStringEncodingUnicode)
2661         {
2662             if (szOut != NULL)
2663                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2664
2665             nRealOutSize = CFStringGetLength(theString) + 1;
2666         }
2667         else
2668         {
2669             CFStringGetBytes(
2670                 theString,
2671                 CFRangeMake(0, CFStringGetLength(theString)),
2672                 m_encoding,
2673                 0, //what to put in characters that can't be converted -
2674                     //0 tells CFString to return NULL if it meets such a character
2675                 false, //not an external representation
2676                 (UInt8*) szOut,
2677                 nOutSize,
2678                 (CFIndex*) &nRealOutSize
2679                         );
2680         }
2681
2682         CFRelease(theString);
2683
2684 #if SIZEOF_WCHAR_T == 4
2685         delete[] szUniBuffer;
2686 #endif
2687
2688         return  nRealOutSize - 1;
2689     }
2690
2691     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2692
2693     bool IsOk() const
2694     {
2695         return m_encoding != kCFStringEncodingInvalidId &&
2696               CFStringIsEncodingAvailable(m_encoding);
2697     }
2698
2699 private:
2700     CFStringEncoding m_encoding ;
2701 };
2702
2703 #endif // defined(__WXCOCOA__)
2704
2705 // ============================================================================
2706 // Mac conversion classes
2707 // ============================================================================
2708
2709 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2710
2711 class wxMBConv_mac : public wxMBConv
2712 {
2713 public:
2714     wxMBConv_mac()
2715     {
2716         Init(CFStringGetSystemEncoding()) ;
2717     }
2718
2719     wxMBConv_mac(const wxMBConv_mac& conv)
2720     {
2721         Init(conv.m_char_encoding);
2722     }
2723
2724 #if wxUSE_FONTMAP
2725     wxMBConv_mac(const wxChar* name)
2726     {
2727         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2728     }
2729 #endif
2730
2731     wxMBConv_mac(wxFontEncoding encoding)
2732     {
2733         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2734     }
2735
2736     ~wxMBConv_mac()
2737     {
2738         OSStatus status = noErr ;
2739         status = TECDisposeConverter(m_MB2WC_converter);
2740         status = TECDisposeConverter(m_WC2MB_converter);
2741     }
2742
2743
2744     void Init( TextEncodingBase encoding)
2745     {
2746         OSStatus status = noErr ;
2747         m_char_encoding = encoding ;
2748         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2749
2750         status = TECCreateConverter(&m_MB2WC_converter,
2751                                     m_char_encoding,
2752                                     m_unicode_encoding);
2753         status = TECCreateConverter(&m_WC2MB_converter,
2754                                     m_unicode_encoding,
2755                                     m_char_encoding);
2756     }
2757
2758     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2759     {
2760         OSStatus status = noErr ;
2761         ByteCount byteOutLen ;
2762         ByteCount byteInLen = strlen(psz) ;
2763         wchar_t *tbuf = NULL ;
2764         UniChar* ubuf = NULL ;
2765         size_t res = 0 ;
2766
2767         if (buf == NULL)
2768         {
2769             // Apple specs say at least 32
2770             n = wxMax( 32, byteInLen ) ;
2771             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2772         }
2773
2774         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2775
2776 #if SIZEOF_WCHAR_T == 4
2777         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2778 #else
2779         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2780 #endif
2781
2782         status = TECConvertText(
2783             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2784             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2785
2786 #if SIZEOF_WCHAR_T == 4
2787         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2788         // is not properly terminated we get random characters at the end
2789         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2790         wxMBConvUTF16 converter ;
2791         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2792         free( ubuf ) ;
2793 #else
2794         res = byteOutLen / sizeof( UniChar ) ;
2795 #endif
2796
2797         if ( buf == NULL )
2798              free(tbuf) ;
2799
2800         if ( buf  && res < n)
2801             buf[res] = 0;
2802
2803         return res ;
2804     }
2805
2806     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2807     {
2808         OSStatus status = noErr ;
2809         ByteCount byteOutLen ;
2810         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2811
2812         char *tbuf = NULL ;
2813
2814         if (buf == NULL)
2815         {
2816             // Apple specs say at least 32
2817             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2818             tbuf = (char*) malloc( n ) ;
2819         }
2820
2821         ByteCount byteBufferLen = n ;
2822         UniChar* ubuf = NULL ;
2823
2824 #if SIZEOF_WCHAR_T == 4
2825         wxMBConvUTF16 converter ;
2826         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2827         byteInLen = unicharlen ;
2828         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2829         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2830 #else
2831         ubuf = (UniChar*) psz ;
2832 #endif
2833
2834         status = TECConvertText(
2835             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2836             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2837
2838 #if SIZEOF_WCHAR_T == 4
2839         free( ubuf ) ;
2840 #endif
2841
2842         if ( buf == NULL )
2843             free(tbuf) ;
2844
2845         size_t res = byteOutLen ;
2846         if ( buf  && res < n)
2847         {
2848             buf[res] = 0;
2849
2850             //we need to double-trip to verify it didn't insert any ? in place
2851             //of bogus characters
2852             wxWCharBuffer wcBuf(n);
2853             size_t pszlen = wxWcslen(psz);
2854             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2855                         wxWcslen(wcBuf) != pszlen ||
2856                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2857             {
2858                 // we didn't obtain the same thing we started from, hence
2859                 // the conversion was lossy and we consider that it failed
2860                 return wxCONV_FAILED;
2861             }
2862         }
2863
2864         return res ;
2865     }
2866
2867     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2868
2869     bool IsOk() const
2870         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2871
2872 private:
2873     TECObjectRef m_MB2WC_converter;
2874     TECObjectRef m_WC2MB_converter;
2875
2876     TextEncodingBase m_char_encoding;
2877     TextEncodingBase m_unicode_encoding;
2878 };
2879
2880 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2881
2882 // ============================================================================
2883 // wxEncodingConverter based conversion classes
2884 // ============================================================================
2885
2886 #if wxUSE_FONTMAP
2887
2888 class wxMBConv_wxwin : public wxMBConv
2889 {
2890 private:
2891     void Init()
2892     {
2893         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2894                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2895     }
2896
2897 public:
2898     // temporarily just use wxEncodingConverter stuff,
2899     // so that it works while a better implementation is built
2900     wxMBConv_wxwin(const wxChar* name)
2901     {
2902         if (name)
2903             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2904         else
2905             m_enc = wxFONTENCODING_SYSTEM;
2906
2907         Init();
2908     }
2909
2910     wxMBConv_wxwin(wxFontEncoding enc)
2911     {
2912         m_enc = enc;
2913
2914         Init();
2915     }
2916
2917     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2918     {
2919         size_t inbuf = strlen(psz);
2920         if (buf)
2921         {
2922             if (!m2w.Convert(psz, buf))
2923                 return wxCONV_FAILED;
2924         }
2925         return inbuf;
2926     }
2927
2928     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2929     {
2930         const size_t inbuf = wxWcslen(psz);
2931         if (buf)
2932         {
2933             if (!w2m.Convert(psz, buf))
2934                 return wxCONV_FAILED;
2935         }
2936
2937         return inbuf;
2938     }
2939
2940     virtual size_t GetMBNulLen() const
2941     {
2942         switch ( m_enc )
2943         {
2944             case wxFONTENCODING_UTF16BE:
2945             case wxFONTENCODING_UTF16LE:
2946                 return 2;
2947
2948             case wxFONTENCODING_UTF32BE:
2949             case wxFONTENCODING_UTF32LE:
2950                 return 4;
2951
2952             default:
2953                 return 1;
2954         }
2955     }
2956
2957     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2958
2959     bool IsOk() const { return m_ok; }
2960
2961 public:
2962     wxFontEncoding m_enc;
2963     wxEncodingConverter m2w, w2m;
2964
2965 private:
2966     // were we initialized successfully?
2967     bool m_ok;
2968
2969     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2970 };
2971
2972 // make the constructors available for unit testing
2973 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2974 {
2975     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2976     if ( !result->IsOk() )
2977     {
2978         delete result;
2979         return 0;
2980     }
2981
2982     return result;
2983 }
2984
2985 #endif // wxUSE_FONTMAP
2986
2987 // ============================================================================
2988 // wxCSConv implementation
2989 // ============================================================================
2990
2991 void wxCSConv::Init()
2992 {
2993     m_name = NULL;
2994     m_convReal =  NULL;
2995     m_deferred = true;
2996 }
2997
2998 wxCSConv::wxCSConv(const wxChar *charset)
2999 {
3000     Init();
3001
3002     if ( charset )
3003     {
3004         SetName(charset);
3005     }
3006
3007 #if wxUSE_FONTMAP
3008     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3009 #else
3010     m_encoding = wxFONTENCODING_SYSTEM;
3011 #endif
3012 }
3013
3014 wxCSConv::wxCSConv(wxFontEncoding encoding)
3015 {
3016     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3017     {
3018         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3019
3020         encoding = wxFONTENCODING_SYSTEM;
3021     }
3022
3023     Init();
3024
3025     m_encoding = encoding;
3026 }
3027
3028 wxCSConv::~wxCSConv()
3029 {
3030     Clear();
3031 }
3032
3033 wxCSConv::wxCSConv(const wxCSConv& conv)
3034         : wxMBConv()
3035 {
3036     Init();
3037
3038     SetName(conv.m_name);
3039     m_encoding = conv.m_encoding;
3040 }
3041
3042 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3043 {
3044     Clear();
3045
3046     SetName(conv.m_name);
3047     m_encoding = conv.m_encoding;
3048
3049     return *this;
3050 }
3051
3052 void wxCSConv::Clear()
3053 {
3054     free(m_name);
3055     delete m_convReal;
3056
3057     m_name = NULL;
3058     m_convReal = NULL;
3059 }
3060
3061 void wxCSConv::SetName(const wxChar *charset)
3062 {
3063     if (charset)
3064     {
3065         m_name = wxStrdup(charset);
3066         m_deferred = true;
3067     }
3068 }
3069
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// map from an encoding to the charset name under which
// wxCSConv::DoCreate() managed to create an iconv conversion for it; an
// empty string records that none of the names of the encoding worked
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the cache itself, it is only accessed from wxCSConv::DoCreate()
static wxEncodingNameCache gs_nameCache;
#endif
3078
3079 wxMBConv *wxCSConv::DoCreate() const
3080 {
3081 #if wxUSE_FONTMAP
3082     wxLogTrace(TRACE_STRCONV,
3083                wxT("creating conversion for %s"),
3084                (m_name ? m_name
3085                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3086 #endif // wxUSE_FONTMAP
3087
3088     // check for the special case of ASCII or ISO8859-1 charset: as we have
3089     // special knowledge of it anyhow, we don't need to create a special
3090     // conversion object
3091     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3092             m_encoding == wxFONTENCODING_DEFAULT )
3093     {
3094         // don't convert at all
3095         return NULL;
3096     }
3097
3098     // we trust OS to do conversion better than we can so try external
3099     // conversion methods first
3100     //
3101     // the full order is:
3102     //      1. OS conversion (iconv() under Unix or Win32 API)
3103     //      2. hard coded conversions for UTF
3104     //      3. wxEncodingConverter as fall back
3105
3106     // step (1)
3107 #ifdef HAVE_ICONV
3108 #if !wxUSE_FONTMAP
3109     if ( m_name )
3110 #endif // !wxUSE_FONTMAP
3111     {
3112         wxString name(m_name);
3113         wxFontEncoding encoding(m_encoding);
3114
3115         if ( !name.empty() )
3116         {
3117             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3118             if ( conv->IsOk() )
3119                 return conv;
3120
3121             delete conv;
3122
3123 #if wxUSE_FONTMAP
3124             encoding =
3125                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3126 #endif // wxUSE_FONTMAP
3127         }
3128 #if wxUSE_FONTMAP
3129         {
3130             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3131             if ( it != gs_nameCache.end() )
3132             {
3133                 if ( it->second.empty() )
3134                     return NULL;
3135
3136                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3137                 if ( conv->IsOk() )
3138                     return conv;
3139
3140                 delete conv;
3141             }
3142
3143             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3144
3145             for ( ; *names; ++names )
3146             {
3147                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3148                 if ( conv->IsOk() )
3149                 {
3150                     gs_nameCache[encoding] = *names;
3151                     return conv;
3152                 }
3153
3154                 delete conv;
3155             }
3156
3157             gs_nameCache[encoding] = _T(""); // cache the failure
3158         }
3159 #endif // wxUSE_FONTMAP
3160     }
3161 #endif // HAVE_ICONV
3162
3163 #ifdef wxHAVE_WIN32_MB2WC
3164     {
3165 #if wxUSE_FONTMAP
3166         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3167                                       : new wxMBConv_win32(m_encoding);
3168         if ( conv->IsOk() )
3169             return conv;
3170
3171         delete conv;
3172 #else
3173         return NULL;
3174 #endif
3175     }
3176 #endif // wxHAVE_WIN32_MB2WC
3177
3178 #if defined(__WXMAC__)
3179     {
3180         // leave UTF16 and UTF32 to the built-ins of wx
3181         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3182             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3183         {
3184 #if wxUSE_FONTMAP
3185             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3186                                         : new wxMBConv_mac(m_encoding);
3187 #else
3188             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3189 #endif
3190             if ( conv->IsOk() )
3191                  return conv;
3192
3193             delete conv;
3194         }
3195     }
3196 #endif
3197
3198 #if defined(__WXCOCOA__)
3199     {
3200         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3201         {
3202 #if wxUSE_FONTMAP
3203             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3204                                           : new wxMBConv_cocoa(m_encoding);
3205 #else
3206             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3207 #endif
3208
3209             if ( conv->IsOk() )
3210                  return conv;
3211
3212             delete conv;
3213         }
3214     }
3215 #endif
3216     // step (2)
3217     wxFontEncoding enc = m_encoding;
3218 #if wxUSE_FONTMAP
3219     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3220     {
3221         // use "false" to suppress interactive dialogs -- we can be called from
3222         // anywhere and popping up a dialog from here is the last thing we want to
3223         // do
3224         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3225     }
3226 #endif // wxUSE_FONTMAP
3227
3228     switch ( enc )
3229     {
3230         case wxFONTENCODING_UTF7:
3231              return new wxMBConvUTF7;
3232
3233         case wxFONTENCODING_UTF8:
3234              return new wxMBConvUTF8;
3235
3236         case wxFONTENCODING_UTF16BE:
3237              return new wxMBConvUTF16BE;
3238
3239         case wxFONTENCODING_UTF16LE:
3240              return new wxMBConvUTF16LE;
3241
3242         case wxFONTENCODING_UTF32BE:
3243              return new wxMBConvUTF32BE;
3244
3245         case wxFONTENCODING_UTF32LE:
3246              return new wxMBConvUTF32LE;
3247
3248         default:
3249              // nothing to do but put here to suppress gcc warnings
3250              break;
3251     }
3252
3253     // step (3)
3254 #if wxUSE_FONTMAP
3255     {
3256         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3257                                       : new wxMBConv_wxwin(m_encoding);
3258         if ( conv->IsOk() )
3259             return conv;
3260
3261         delete conv;
3262     }
3263 #endif // wxUSE_FONTMAP
3264
3265     // NB: This is a hack to prevent deadlock. What could otherwise happen
3266     //     in Unicode build: wxConvLocal creation ends up being here
3267     //     because of some failure and logs the error. But wxLog will try to
3268     //     attach timestamp, for which it will need wxConvLocal (to convert
3269     //     time to char* and then wchar_t*), but that fails, tries to log
3270     //     error, but wxLog has a (already locked) critical section that
3271     //     guards static buffer.
3272     static bool alreadyLoggingError = false;
3273     if (!alreadyLoggingError)
3274     {
3275         alreadyLoggingError = true;
3276         wxLogError(_("Cannot convert from the charset '%s'!"),
3277                    m_name ? m_name
3278                       :
3279 #if wxUSE_FONTMAP
3280                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3281 #else // !wxUSE_FONTMAP
3282                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3283 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3284               );
3285
3286         alreadyLoggingError = false;
3287     }
3288
3289     return NULL;
3290 }
3291
3292 void wxCSConv::CreateConvIfNeeded() const
3293 {
3294     if ( m_deferred )
3295     {
3296         wxCSConv *self = (wxCSConv *)this; // const_cast
3297
3298 #if wxUSE_INTL
3299         // if we don't have neither the name nor the encoding, use the default
3300         // encoding for this system
3301         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3302         {
3303             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3304         }
3305 #endif // wxUSE_INTL
3306
3307         self->m_convReal = DoCreate();
3308         self->m_deferred = false;
3309     }
3310 }
3311
3312 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3313 {
3314     CreateConvIfNeeded();
3315
3316     if (m_convReal)
3317         return m_convReal->MB2WC(buf, psz, n);
3318
3319     // latin-1 (direct)
3320     size_t len = strlen(psz);
3321
3322     if (buf)
3323     {
3324         for (size_t c = 0; c <= len; c++)
3325             buf[c] = (unsigned char)(psz[c]);
3326     }
3327
3328     return len;
3329 }
3330
3331 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3332 {
3333     CreateConvIfNeeded();
3334
3335     if (m_convReal)
3336         return m_convReal->WC2MB(buf, psz, n);
3337
3338     // latin-1 (direct)
3339     const size_t len = wxWcslen(psz);
3340     if (buf)
3341     {
3342         for (size_t c = 0; c <= len; c++)
3343         {
3344             if (psz[c] > 0xFF)
3345                 return wxCONV_FAILED;
3346
3347             buf[c] = (char)psz[c];
3348         }
3349     }
3350     else
3351     {
3352         for (size_t c = 0; c <= len; c++)
3353         {
3354             if (psz[c] > 0xFF)
3355                 return wxCONV_FAILED;
3356         }
3357     }
3358
3359     return len;
3360 }
3361
3362 size_t wxCSConv::GetMBNulLen() const
3363 {
3364     CreateConvIfNeeded();
3365
3366     if ( m_convReal )
3367     {
3368         return m_convReal->GetMBNulLen();
3369     }
3370
3371     return 1;
3372 }
3373
3374 // ----------------------------------------------------------------------------
3375 // globals
3376 // ----------------------------------------------------------------------------
3377
3378 #ifdef __WINDOWS__
3379     static wxMBConv_win32 wxConvLibcObj;
3380 #elif defined(__WXMAC__) && !defined(__MACH__)
3381     static wxMBConv_mac wxConvLibcObj ;
3382 #else
3383     static wxMBConvLibc wxConvLibcObj;
3384 #endif
3385
3386 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3387 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3388 static wxMBConvUTF7 wxConvUTF7Obj;
3389 static wxMBConvUTF8 wxConvUTF8Obj;
3390
3391 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3392 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3393 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3394 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3395 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3396 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3397 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3398 #ifdef __WXOSX__
3399                                     wxConvUTF8Obj;
3400 #else
3401                                     wxConvLibcObj;
3402 #endif
3403
3404
3405 #else // !wxUSE_WCHAR_T
3406
3407 // stand-ins in absence of wchar_t
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3409                                 wxConvISO8859_1,
3410                                 wxConvLocal,
3411                                 wxConvUTF8;
3412
3413 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T