src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded; but
 207             // still account for the trailing NULL
 208             dstWritten++;
 209             break;
 210         }
 211
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for trailing NUL
 216
 217         dstWritten += lenChunk;
 218
 219         if ( dst )
 220         {
 221             if ( dstWritten > dstLen )
 222                 return wxCONV_FAILED;
 223
 224             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 225                 return wxCONV_FAILED;
 226
 227             dst += lenChunk;
 228         }
 229
 230         if ( !srcEnd )
 231         {
 232             // we convert the entire string in this cas, as we suppose that the
 233             // string is NUL-terminated and so srcEnd is not used at all
 234             break;
 235         }
 236
 237         // advance the input pointer past the end of this chunk
 238         while ( NotAllNULs(src, nulLen) )
 239         {
 240             // notice that we must skip over multiple bytes here as we suppose
 241             // that if NUL takes 2 or 4 bytes, then all the other characters do
 242             // too and so if advanced by a single byte we might erroneously
 243             // detect sequences of NUL bytes in the middle of the input
 244             src += nulLen;
 245         }
 246
 247         src += nulLen; // skipping over its terminator as well
 248
 249         // note that ">=" (and not just "==") is needed here as the terminator
 250         // we skipped just above could be inside or just after the buffer
 251         // delimited by inEnd
 252         if ( src >= srcEnd )
 253             break;
 254     }
 255
 256     return dstWritten;
 257 }
 258
 259 size_t
 260 wxMBConv::FromWChar(char *dst, size_t dstLen,
 261                     const wchar_t *src, size_t srcLen) const
 262 {
 263     // the number of chars [which would be] written to dst [if it were not NULL]
 264     size_t dstWritten = 0;
 265
 266     // make a copy of the input string unless it is already properly
 267     // NUL-terminated
 268     //
 269     // if we don't know its length we have no choice but to assume that it is,
 270     // indeed, properly terminated
 271     wxWCharBuffer bufTmp;
 272     if ( srcLen == (size_t)-1 )
 273     {
 274         srcLen = wxWcslen(src) + 1;
 275     }
 276     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 277     {
 278         // make a copy in order to properly NUL-terminate the string
 279         bufTmp = wxWCharBuffer(srcLen);
 280         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 281         src = bufTmp;
 282     }
 283
 284     const size_t lenNul = GetMBNulLen();
 285     for ( const wchar_t * const srcEnd = src + srcLen;
 286           src < srcEnd;
 287           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 288     {
 289         // try to convert the current chunk
 290         size_t lenChunk = WC2MB(NULL, src, 0);
 291
 292         if ( lenChunk == wxCONV_FAILED )
 293             return wxCONV_FAILED;
 294
 295         lenChunk += lenNul;
 296         dstWritten += lenChunk;
 297
 298         if ( dst )
 299         {
 300             if ( dstWritten > dstLen )
 301                 return wxCONV_FAILED;
 302
 303             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 304                 return wxCONV_FAILED;
 305
 306             dst += lenChunk;
 307         }
 308     }
 309
 310     return dstWritten;
 311 }
 312
 313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 314 {
 315     size_t rc = ToWChar(out, outLen, in);
 316     if ( rc != wxCONV_FAILED )
 317     {
 318         // ToWChar() returns the buffer length, i.e. including the trailing
 319         // NUL, while this method doesn't take it into account
 320         rc--;
 321     }
 322
 323     return rc;
 324 }
 325
 326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 327 {
 328     size_t rc = FromWChar(out, outLen, in);
 329     if ( rc != wxCONV_FAILED )
 330     {
 331         rc -= GetMBNulLen();
 332     }
 333
 334     return rc;
 335 }
 336
 337 wxMBConv::~wxMBConv()
 338 {
 339     // nothing to do here (necessary for Darwin linking probably)
 340 }
 341
 342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 343 {
 344     if ( psz )
 345     {
 346         // calculate the length of the buffer needed first
 347         const size_t nLen = MB2WC(NULL, psz, 0);
 348         if ( nLen != wxCONV_FAILED )
 349         {
 350             // now do the actual conversion
 351             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 352
 353             // +1 for the trailing NULL
 354             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 355                 return buf;
 356         }
 357     }
 358
 359     return wxWCharBuffer();
 360 }
 361
 362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 363 {
 364     if ( pwz )
 365     {
 366         const size_t nLen = WC2MB(NULL, pwz, 0);
 367         if ( nLen != wxCONV_FAILED )
 368         {
 369             // extra space for trailing NUL(s)
 370             static const size_t extraLen = GetMaxMBNulLen();
 371
 372             wxCharBuffer buf(nLen + extraLen - 1);
 373             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxCharBuffer();
 379 }
 380
 381 const wxWCharBuffer
 382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 383 {
 384     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 385     if ( dstLen != wxCONV_FAILED )
 386     {
 387         wxWCharBuffer wbuf(dstLen - 1);
 388         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 389         {
 390             if ( outLen )
 391                 *outLen = dstLen - 1;
 392             return wbuf;
 393         }
 394     }
 395
 396     if ( outLen )
 397         *outLen = 0;
 398
 399     return wxWCharBuffer();
 400 }
 401
 402 const wxCharBuffer
 403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxCharBuffer buf(dstLen - 1);
 409         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 410         {
 411             if ( outLen )
 412                 *outLen = dstLen - 1;
 413             return buf;
 414         }
 415     }
 416
 417     if ( outLen )
 418         *outLen = 0;
 419
 420     return wxCharBuffer();
 421 }
 422
 423 // ----------------------------------------------------------------------------
 424 // wxMBConvLibc
 425 // ----------------------------------------------------------------------------
 426
 427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 428 {
 429     return wxMB2WC(buf, psz, n);
 430 }
 431
 432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 433 {
 434     return wxWC2MB(buf, psz, n);
 435 }
 436
 437 // ----------------------------------------------------------------------------
 438 // wxConvBrokenFileNames
 439 // ----------------------------------------------------------------------------
 440
 441 #ifdef __UNIX__
 442
 443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 444 {
 445     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 446                   || wxStricmp(charset, _T("UTF8")) == 0  )
 447         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 448     else
 449         m_conv = new wxCSConv(charset);
 450 }
 451
 452 #endif // __UNIX__
 453
 454 // ----------------------------------------------------------------------------
 455 // UTF-7
 456 // ----------------------------------------------------------------------------
 457
 458 // Implementation (C) 2004 Fredrik Roubert
 459
 460 //
 461 // BASE64 decoding table
 462 //
 463 static const unsigned char utf7unb64[] =
 464 {
 465     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 471     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 472     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 473     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 474     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 475     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 476     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 477     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 478     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 479     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 480     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 481     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 482     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 483     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 485     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 486     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 488     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 489     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 497 };
 498
 499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 500 {
 501     size_t len = 0;
 502
 503     while ( *psz && (!buf || (len < n)) )
 504     {
 505         unsigned char cc = *psz++;
 506         if (cc != '+')
 507         {
 508             // plain ASCII char
 509             if (buf)
 510                 *buf++ = cc;
 511             len++;
 512         }
 513         else if (*psz == '-')
 514         {
 515             // encoded plus sign
 516             if (buf)
 517                 *buf++ = cc;
 518             len++;
 519             psz++;
 520         }
 521         else // start of BASE64 encoded string
 522         {
 523             bool lsb, ok;
 524             unsigned int d, l;
 525             for ( ok = lsb = false, d = 0, l = 0;
 526                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 527                   psz++ )
 528             {
 529                 d <<= 6;
 530                 d += cc;
 531                 for (l += 6; l >= 8; lsb = !lsb)
 532                 {
 533                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 534                     if (lsb)
 535                     {
 536                         if (buf)
 537                             *buf++ |= c;
 538                         len ++;
 539                     }
 540                     else
 541                     {
 542                         if (buf)
 543                             *buf = (wchar_t)(c << 8);
 544                     }
 545
 546                     ok = true;
 547                 }
 548             }
 549
 550             if ( !ok )
 551             {
 552                 // in valid UTF7 we should have valid characters after '+'
 553                 return (size_t)-1;
 554             }
 555
 556             if (*psz == '-')
 557                 psz++;
 558         }
 559     }
 560
 561     if ( buf && (len < n) )
 562         *buf = '\0';
 563
 564     return len;
 565 }
 566
 567 //
 568 // BASE64 encoding table
 569 //
 570 static const unsigned char utf7enb64[] =
 571 {
 572     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 573     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 574     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 575     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 576     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 577     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 578     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 579     '4', '5', '6', '7', '8', '9', '+', '/'
 580 };
 581
 582 //
 583 // UTF-7 encoding table
 584 //
 585 // 0 - Set D (directly encoded characters)
 586 // 1 - Set O (optional direct characters)
 587 // 2 - whitespace characters (optional)
 588 // 3 - special characters
 589 //
 590 static const unsigned char utf7encode[128] =
 591 {
 592     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 593     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 594     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 595     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 596     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 597     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 598     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 599     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 600 };
 601
 602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 603 {
 604     size_t len = 0;
 605
 606     while (*psz && ((!buf) || (len < n)))
 607     {
 608         wchar_t cc = *psz++;
 609         if (cc < 0x80 && utf7encode[cc] < 1)
 610         {
 611             // plain ASCII char
 612             if (buf)
 613                 *buf++ = (char)cc;
 614             len++;
 615         }
 616 #ifndef WC_UTF16
 617         else if (((wxUint32)cc) > 0xffff)
 618         {
 619             // no surrogate pair generation (yet?)
 620             return (size_t)-1;
 621         }
 622 #endif
 623         else
 624         {
 625             if (buf)
 626                 *buf++ = '+';
 627             len++;
 628             if (cc != '+')
 629             {
 630                 // BASE64 encode string
 631                 unsigned int lsb, d, l;
 632                 for (d = 0, l = 0; /*nothing*/; psz++)
 633                 {
 634                     for (lsb = 0; lsb < 2; lsb ++)
 635                     {
 636                         d <<= 8;
 637                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 638
 639                         for (l += 8; l >= 6; )
 640                         {
 641                             l -= 6;
 642                             if (buf)
 643                                 *buf++ = utf7enb64[(d >> l) % 64];
 644                             len++;
 645                         }
 646                     }
 647                     cc = *psz;
 648                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 649                         break;
 650                 }
 651                 if (l != 0)
 652                 {
 653                     if (buf)
 654                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 655                     len++;
 656                 }
 657             }
 658             if (buf)
 659                 *buf++ = '-';
 660             len++;
 661         }
 662     }
 663     if (buf && (len < n))
 664         *buf = 0;
 665     return len;
 666 }
 667
 668 // ----------------------------------------------------------------------------
 669 // UTF-8
 670 // ----------------------------------------------------------------------------
 671
 672 static wxUint32 utf8_max[]=
 673     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 674
 675 // boundaries of the private use area we use to (temporarily) remap invalid
 676 // characters invalid in a UTF-8 encoded string
 677 const wxUint32 wxUnicodePUA = 0x100000;
 678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 679
 680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 681 {
 682     size_t len = 0;
 683
 684     while (*psz && ((!buf) || (len < n)))
 685     {
 686         const char *opsz = psz;
 687         bool invalid = false;
 688         unsigned char cc = *psz++, fc = cc;
 689         unsigned cnt;
 690         for (cnt = 0; fc & 0x80; cnt++)
 691             fc <<= 1;
 692         if (!cnt)
 693         {
 694             // plain ASCII char
 695             if (buf)
 696                 *buf++ = cc;
 697             len++;
 698
 699             // escape the escape character for octal escapes
 700             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 701                     && cc == '\\' && (!buf || len < n))
 702             {
 703                 if (buf)
 704                     *buf++ = cc;
 705                 len++;
 706             }
 707         }
 708         else
 709         {
 710             cnt--;
 711             if (!cnt)
 712             {
 713                 // invalid UTF-8 sequence
 714                 invalid = true;
 715             }
 716             else
 717             {
 718                 unsigned ocnt = cnt - 1;
 719                 wxUint32 res = cc & (0x3f >> cnt);
 720                 while (cnt--)
 721                 {
 722                     cc = *psz;
 723                     if ((cc & 0xC0) != 0x80)
 724                     {
 725                         // invalid UTF-8 sequence
 726                         invalid = true;
 727                         break;
 728                     }
 729                     psz++;
 730                     res = (res << 6) | (cc & 0x3f);
 731                 }
 732                 if (invalid || res <= utf8_max[ocnt])
 733                 {
 734                     // illegal UTF-8 encoding
 735                     invalid = true;
 736                 }
 737                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 738                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 739                 {
 740                     // if one of our PUA characters turns up externally
 741                     // it must also be treated as an illegal sequence
 742                     // (a bit like you have to escape an escape character)
 743                     invalid = true;
 744                 }
 745                 else
 746                 {
 747 #ifdef WC_UTF16
 748                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 749                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 750                     if (pa == (size_t)-1)
 751                     {
 752                         invalid = true;
 753                     }
 754                     else
 755                     {
 756                         if (buf)
 757                             buf += pa;
 758                         len += pa;
 759                     }
 760 #else // !WC_UTF16
 761                     if (buf)
 762                         *buf++ = (wchar_t)res;
 763                     len++;
 764 #endif // WC_UTF16/!WC_UTF16
 765                 }
 766             }
 767             if (invalid)
 768             {
 769                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 770                 {
 771                     while (opsz < psz && (!buf || len < n))
 772                     {
 773 #ifdef WC_UTF16
 774                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 775                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 776                         wxASSERT(pa != (size_t)-1);
 777                         if (buf)
 778                             buf += pa;
 779                         opsz++;
 780                         len += pa;
 781 #else
 782                         if (buf)
 783                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 784                         opsz++;
 785                         len++;
 786 #endif
 787                     }
 788                 }
 789                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 790                 {
 791                     while (opsz < psz && (!buf || len < n))
 792                     {
 793                         if ( buf && len + 3 < n )
 794                         {
 795                             unsigned char on = *opsz;
 796                             *buf++ = L'\\';
 797                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 798                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 799                             *buf++ = (wchar_t)( L'0' + on % 010 );
 800                         }
 801                         opsz++;
 802                         len += 4;
 803                     }
 804                 }
 805                 else // MAP_INVALID_UTF8_NOT
 806                 {
 807                     return (size_t)-1;
 808                 }
 809             }
 810         }
 811     }
 812     if (buf && (len < n))
 813         *buf = 0;
 814     return len;
 815 }
 816
 817 static inline bool isoctal(wchar_t wch)
 818 {
 819     return L'0' <= wch && wch <= L'7';
 820 }
 821
 822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 823 {
 824     size_t len = 0;
 825
 826     while (*psz && ((!buf) || (len < n)))
 827     {
 828         wxUint32 cc;
 829 #ifdef WC_UTF16
 830         // cast is ok for WC_UTF16
 831         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 832         psz += (pa == (size_t)-1) ? 1 : pa;
 833 #else
 834         cc=(*psz++) & 0x7fffffff;
 835 #endif
 836
 837         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 838                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 839         {
 840             if (buf)
 841                 *buf++ = (char)(cc - wxUnicodePUA);
 842             len++;
 843         }
 844         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 845                     && cc == L'\\' && psz[0] == L'\\' )
 846         {
 847             if (buf)
 848                 *buf++ = (char)cc;
 849             psz++;
 850             len++;
 851         }
 852         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 853                     cc == L'\\' &&
 854                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 855         {
 856             if (buf)
 857             {
 858                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 859                                  (psz[1] - L'0')*010 +
 860                                  (psz[2] - L'0'));
 861             }
 862
 863             psz += 3;
 864             len++;
 865         }
 866         else
 867         {
 868             unsigned cnt;
 869             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 870             if (!cnt)
 871             {
 872                 // plain ASCII char
 873                 if (buf)
 874                     *buf++ = (char) cc;
 875                 len++;
 876             }
 877
 878             else
 879             {
 880                 len += cnt + 1;
 881                 if (buf)
 882                 {
 883                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 884                     while (cnt--)
 885                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 886                 }
 887             }
 888         }
 889     }
 890
 891     if (buf && (len<n))
 892         *buf = 0;
 893
 894     return len;
 895 }
 896
 897 // ----------------------------------------------------------------------------
 898 // UTF-16
 899 // ----------------------------------------------------------------------------
 900
 901 #ifdef WORDS_BIGENDIAN
 902     #define wxMBConvUTF16straight wxMBConvUTF16BE
 903     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 904 #else
 905     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 906     #define wxMBConvUTF16straight wxMBConvUTF16LE
 907 #endif
 908
 909
 910 #ifdef WC_UTF16
 911
 912 // copy 16bit MB to 16bit String
 913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 914 {
 915     size_t len=0;
 916
 917     while (*(wxUint16*)psz && (!buf || len < n))
 918     {
 919         if (buf)
 920             *buf++ = *(wxUint16*)psz;
 921         len++;
 922
 923         psz += sizeof(wxUint16);
 924     }
 925     if (buf && len<n)   *buf=0;
 926
 927     return len;
 928 }
 929
 930
 931 // copy 16bit String to 16bit MB
 932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 933 {
 934     size_t len=0;
 935
 936     while (*psz && (!buf || len < n))
 937     {
 938         if (buf)
 939         {
 940             *(wxUint16*)buf = *psz;
 941             buf += sizeof(wxUint16);
 942         }
 943         len += sizeof(wxUint16);
 944         psz++;
 945     }
 946     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 947
 948     return len;
 949 }
 950
 951
 952 // swap 16bit MB to 16bit String
 953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 954 {
 955     size_t len = 0;
 956
 957     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 958     // inside the string
 959     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 960     {
 961         if ( buf )
 962         {
 963             ((char *)buf)[0] = psz[1];
 964             ((char *)buf)[1] = psz[0];
 965             buf++;
 966         }
 967         len++;
 968         psz += 2;
 969     }
 970
 971     if ( buf && len < n )
 972         *buf = L'\0';
 973
 974     return len;
 975 }
 976
 977
 978 // swap 16bit MB to 16bit String
 979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 980 {
 981     size_t len = 0;
 982
 983     while ( *psz && (!buf || len < n) )
 984     {
 985         if ( buf )
 986         {
 987             *buf++ = ((char*)psz)[1];
 988             *buf++ = ((char*)psz)[0];
 989         }
 990         len += 2;
 991         psz++;
 992     }
 993
 994     if ( buf && len < n )
 995         *buf = '\0';
 996
 997     return len;
 998 }
 999
1000
1001 #else // WC_UTF16
1002
1003
1004 // copy 16bit MB to 32bit String
1005 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1006 {
1007     size_t len=0;
1008
1009     while (*(wxUint16*)psz && (!buf || len < n))
1010     {
1011         wxUint32 cc;
1012         size_t pa=decode_utf16((wxUint16*)psz, cc);
1013         if (pa == (size_t)-1)
1014             return pa;
1015
1016         if (buf)
1017             *buf++ = (wchar_t)cc;
1018         len++;
1019         psz += pa * sizeof(wxUint16);
1020     }
1021     if (buf && len<n)   *buf=0;
1022
1023     return len;
1024 }
1025
1026
1027 // copy 32bit String to 16bit MB
1028 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1029 {
1030     size_t len=0;
1031
1032     while (*psz && (!buf || len < n))
1033     {
1034         wxUint16 cc[2];
1035         size_t pa=encode_utf16(*psz, cc);
1036
1037         if (pa == (size_t)-1)
1038             return pa;
1039
1040         if (buf)
1041         {
1042             *(wxUint16*)buf = cc[0];
1043             buf += sizeof(wxUint16);
1044             if (pa > 1)
1045             {
1046                 *(wxUint16*)buf = cc[1];
1047                 buf += sizeof(wxUint16);
1048             }
1049         }
1050
1051         len += pa*sizeof(wxUint16);
1052         psz++;
1053     }
1054     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1055
1056     return len;
1057 }
1058
1059
1060 // swap 16bit MB to 32bit String
1061 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1062 {
1063     size_t len=0;
1064
1065     while (*(wxUint16*)psz && (!buf || len < n))
1066     {
1067         wxUint32 cc;
1068         char tmp[4];
1069         tmp[0]=psz[1];  tmp[1]=psz[0];
1070         tmp[2]=psz[3];  tmp[3]=psz[2];
1071
1072         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1073         if (pa == (size_t)-1)
1074             return pa;
1075
1076         if (buf)
1077             *buf++ = (wchar_t)cc;
1078
1079         len++;
1080         psz += pa * sizeof(wxUint16);
1081     }
1082     if (buf && len<n)   *buf=0;
1083
1084     return len;
1085 }
1086
1087
1088 // swap 32bit String to 16bit MB
1089 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1090 {
1091     size_t len=0;
1092
1093     while (*psz && (!buf || len < n))
1094     {
1095         wxUint16 cc[2];
1096         size_t pa=encode_utf16(*psz, cc);
1097
1098         if (pa == (size_t)-1)
1099             return pa;
1100
1101         if (buf)
1102         {
1103             *buf++ = ((char*)cc)[1];
1104             *buf++ = ((char*)cc)[0];
1105             if (pa > 1)
1106             {
1107                 *buf++ = ((char*)cc)[3];
1108                 *buf++ = ((char*)cc)[2];
1109             }
1110         }
1111
1112         len += pa*sizeof(wxUint16);
1113         psz++;
1114     }
1115     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1116
1117     return len;
1118 }
1119
1120 #endif // WC_UTF16
1121
1122
1123 // ----------------------------------------------------------------------------
1124 // UTF-32
1125 // ----------------------------------------------------------------------------
1126
1127 #ifdef WORDS_BIGENDIAN
1128 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1129 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1130 #else
1131 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1132 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1133 #endif
1134
1135
1136 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1137 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1138
1139
1140 #ifdef WC_UTF16
1141
1142 // copy 32bit MB to 16bit String
1143 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1144 {
1145     size_t len=0;
1146
1147     while (*(wxUint32*)psz && (!buf || len < n))
1148     {
1149         wxUint16 cc[2];
1150
1151         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1152         if (pa == (size_t)-1)
1153             return pa;
1154
1155         if (buf)
1156         {
1157             *buf++ = cc[0];
1158             if (pa > 1)
1159                 *buf++ = cc[1];
1160         }
1161         len += pa;
1162         psz += sizeof(wxUint32);
1163     }
1164     if (buf && len<n)   *buf=0;
1165
1166     return len;
1167 }
1168
1169
1170 // copy 16bit String to 32bit MB
1171 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1172 {
1173     size_t len=0;
1174
1175     while (*psz && (!buf || len < n))
1176     {
1177         wxUint32 cc;
1178
1179         // cast is ok for WC_UTF16
1180         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1181         if (pa == (size_t)-1)
1182             return pa;
1183
1184         if (buf)
1185         {
1186             *(wxUint32*)buf = cc;
1187             buf += sizeof(wxUint32);
1188         }
1189         len += sizeof(wxUint32);
1190         psz += pa;
1191     }
1192
1193     if (buf && len<=n-sizeof(wxUint32))
1194         *(wxUint32*)buf=0;
1195
1196     return len;
1197 }
1198
1199
1200
1201 // swap 32bit MB to 16bit String
1202 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1203 {
1204     size_t len=0;
1205
1206     while (*(wxUint32*)psz && (!buf || len < n))
1207     {
1208         char tmp[4];
1209         tmp[0] = psz[3];   tmp[1] = psz[2];
1210         tmp[2] = psz[1];   tmp[3] = psz[0];
1211
1212
1213         wxUint16 cc[2];
1214
1215         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1216         if (pa == (size_t)-1)
1217             return pa;
1218
1219         if (buf)
1220         {
1221             *buf++ = cc[0];
1222             if (pa > 1)
1223                 *buf++ = cc[1];
1224         }
1225         len += pa;
1226         psz += sizeof(wxUint32);
1227     }
1228
1229     if (buf && len<n)
1230         *buf=0;
1231
1232     return len;
1233 }
1234
1235
1236 // swap 16bit String to 32bit MB
1237 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1238 {
1239     size_t len=0;
1240
1241     while (*psz && (!buf || len < n))
1242     {
1243         char cc[4];
1244
1245         // cast is ok for WC_UTF16
1246         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1247         if (pa == (size_t)-1)
1248             return pa;
1249
1250         if (buf)
1251         {
1252             *buf++ = cc[3];
1253             *buf++ = cc[2];
1254             *buf++ = cc[1];
1255             *buf++ = cc[0];
1256         }
1257         len += sizeof(wxUint32);
1258         psz += pa;
1259     }
1260
1261     if (buf && len<=n-sizeof(wxUint32))
1262         *(wxUint32*)buf=0;
1263
1264     return len;
1265 }
1266
1267 #else // WC_UTF16
1268
1269
1270 // copy 32bit MB to 32bit String
1271 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1272 {
1273     size_t len=0;
1274
1275     while (*(wxUint32*)psz && (!buf || len < n))
1276     {
1277         if (buf)
1278             *buf++ = (wchar_t)(*(wxUint32*)psz);
1279         len++;
1280         psz += sizeof(wxUint32);
1281     }
1282
1283     if (buf && len<n)
1284         *buf=0;
1285
1286     return len;
1287 }
1288
1289
1290 // copy 32bit String to 32bit MB
1291 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1292 {
1293     size_t len=0;
1294
1295     while (*psz && (!buf || len < n))
1296     {
1297         if (buf)
1298         {
1299             *(wxUint32*)buf = *psz;
1300             buf += sizeof(wxUint32);
1301         }
1302
1303         len += sizeof(wxUint32);
1304         psz++;
1305     }
1306
1307     if (buf && len<=n-sizeof(wxUint32))
1308         *(wxUint32*)buf=0;
1309
1310     return len;
1311 }
1312
1313
1314 // swap 32bit MB to 32bit String
1315 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1316 {
1317     size_t len=0;
1318
1319     while (*(wxUint32*)psz && (!buf || len < n))
1320     {
1321         if (buf)
1322         {
1323             ((char *)buf)[0] = psz[3];
1324             ((char *)buf)[1] = psz[2];
1325             ((char *)buf)[2] = psz[1];
1326             ((char *)buf)[3] = psz[0];
1327             buf++;
1328         }
1329         len++;
1330         psz += sizeof(wxUint32);
1331     }
1332
1333     if (buf && len<n)
1334         *buf=0;
1335
1336     return len;
1337 }
1338
1339
1340 // swap 32bit String to 32bit MB
1341 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1342 {
1343     size_t len=0;
1344
1345     while (*psz && (!buf || len < n))
1346     {
1347         if (buf)
1348         {
1349             *buf++ = ((char *)psz)[3];
1350             *buf++ = ((char *)psz)[2];
1351             *buf++ = ((char *)psz)[1];
1352             *buf++ = ((char *)psz)[0];
1353         }
1354         len += sizeof(wxUint32);
1355         psz++;
1356     }
1357
1358     if (buf && len<=n-sizeof(wxUint32))
1359         *(wxUint32*)buf=0;
1360
1361     return len;
1362 }
1363
1364
1365 #endif // WC_UTF16
1366
1367
1368 // ============================================================================
1369 // The classes doing conversion using the iconv_xxx() functions
1370 // ============================================================================
1371
1372 #ifdef HAVE_ICONV
1373
1374 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1375 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1376 //     (unless there's yet another bug in glibc) the only case when iconv()
1377 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1378 //     left in the input buffer -- when _real_ error occurs,
1379 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1380 //     iconv() failure.
1381 //     [This bug does not appear in glibc 2.2.]
1382 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1383 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1384                                      (errno != E2BIG || bufLeft != 0))
1385 #else
1386 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1387 #endif
1388
1389 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1390
1391 #define ICONV_T_INVALID ((iconv_t)-1)
1392
1393 #if SIZEOF_WCHAR_T == 4
1394     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1395     #define WC_ENC      wxFONTENCODING_UTF32
1396 #elif SIZEOF_WCHAR_T == 2
1397     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1398     #define WC_ENC      wxFONTENCODING_UTF16
1399 #else // sizeof(wchar_t) != 2 nor 4
1400     // does this ever happen?
1401     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1402 #endif
1403
1404 // ----------------------------------------------------------------------------
1405 // wxMBConv_iconv: encapsulates an iconv character set
1406 // ----------------------------------------------------------------------------
1407
1408 class wxMBConv_iconv : public wxMBConv
1409 {
1410 public:
1411     wxMBConv_iconv(const wxChar *name);
1412     virtual ~wxMBConv_iconv();
1413
1414     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1415     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1416
1417     // classify this encoding as explained in wxMBConv::GetMBNulLen()
1418     // comment
1419     virtual size_t GetMBNulLen() const;
1420
1421     bool IsOk() const
1422         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1423
1424 protected:
1425     // the iconv handlers used to translate from multibyte to wide char and in
1426     // the other direction
1427     iconv_t m2w,
1428             w2m;
1429 #if wxUSE_THREADS
1430     // guards access to m2w and w2m objects
1431     wxMutex m_iconvMutex;
1432 #endif
1433
1434 private:
1435     // the name (for iconv_open()) of a wide char charset -- if none is
1436     // available on this machine, it will remain NULL
1437     static wxString ms_wcCharsetName;
1438
1439     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1440     // different endian-ness than the native one
1441     static bool ms_wcNeedsSwap;
1442
1443     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1444     // initially
1445     size_t m_minMBCharWidth;
1446 };
1447
1448 // make the constructor available for unit testing
1449 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1450 {
1451     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1452     if ( !result->IsOk() )
1453     {
1454         delete result;
1455         return 0;
1456     }
1457     return result;
1458 }
1459
1460 wxString wxMBConv_iconv::ms_wcCharsetName;
1461 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1462
1463 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1464 {
1465     m_minMBCharWidth = 0;
1466
1467     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1468     // names for the charsets
1469     const wxCharBuffer cname(wxString(name).ToAscii());
1470
1471     // check for charset that represents wchar_t:
1472     if ( ms_wcCharsetName.empty() )
1473     {
1474         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1475
1476 #if wxUSE_FONTMAP
1477         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1478 #else // !wxUSE_FONTMAP
1479         static const wxChar *names[] =
1480         {
1481 #if SIZEOF_WCHAR_T == 4
1482             _T("UCS-4"),
1483 #elif SIZEOF_WCHAR_T = 2
1484             _T("UCS-2"),
1485 #endif
1486             NULL
1487         };
1488 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1489
1490         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1491         {
1492             const wxString nameCS(*names);
1493
1494             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1495             wxString nameXE(nameCS);
1496             #ifdef WORDS_BIGENDIAN
1497                 nameXE += _T("BE");
1498             #else // little endian
1499                 nameXE += _T("LE");
1500             #endif
1501
1502             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1503                        nameXE.c_str());
1504
1505             m2w = iconv_open(nameXE.ToAscii(), cname);
1506             if ( m2w == ICONV_T_INVALID )
1507             {
1508                 // try charset w/o bytesex info (e.g. "UCS4")
1509                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1510                            nameCS.c_str());
1511                 m2w = iconv_open(nameCS.ToAscii(), cname);
1512
1513                 // and check for bytesex ourselves:
1514                 if ( m2w != ICONV_T_INVALID )
1515                 {
1516                     char    buf[2], *bufPtr;
1517                     wchar_t wbuf[2], *wbufPtr;
1518                     size_t  insz, outsz;
1519                     size_t  res;
1520
1521                     buf[0] = 'A';
1522                     buf[1] = 0;
1523                     wbuf[0] = 0;
1524                     insz = 2;
1525                     outsz = SIZEOF_WCHAR_T * 2;
1526                     wbufPtr = wbuf;
1527                     bufPtr = buf;
1528
1529                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1530                                 (char**)&wbufPtr, &outsz);
1531
1532                     if (ICONV_FAILED(res, insz))
1533                     {
1534                         wxLogLastError(wxT("iconv"));
1535                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1536                                    nameCS.c_str());
1537                     }
1538                     else // ok, can convert to this encoding, remember it
1539                     {
1540                         ms_wcCharsetName = nameCS;
1541                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1542                     }
1543                 }
1544             }
1545             else // use charset not requiring byte swapping
1546             {
1547                 ms_wcCharsetName = nameXE;
1548             }
1549         }
1550
1551         wxLogTrace(TRACE_STRCONV,
1552                    wxT("iconv wchar_t charset is \"%s\"%s"),
1553                    ms_wcCharsetName.empty() ? _T("<none>")
1554                                             : ms_wcCharsetName.c_str(),
1555                    ms_wcNeedsSwap ? _T(" (needs swap)")
1556                                   : _T(""));
1557     }
1558     else // we already have ms_wcCharsetName
1559     {
1560         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1561     }
1562
1563     if ( ms_wcCharsetName.empty() )
1564     {
1565         w2m = ICONV_T_INVALID;
1566     }
1567     else
1568     {
1569         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1570         if ( w2m == ICONV_T_INVALID )
1571         {
1572             wxLogTrace(TRACE_STRCONV,
1573                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1574                        ms_wcCharsetName.c_str(), cname.data());
1575         }
1576     }
1577 }
1578
1579 wxMBConv_iconv::~wxMBConv_iconv()
1580 {
1581     if ( m2w != ICONV_T_INVALID )
1582         iconv_close(m2w);
1583     if ( w2m != ICONV_T_INVALID )
1584         iconv_close(w2m);
1585 }
1586
1587 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1588 {
1589     // find the string length: notice that must be done differently for
1590     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1591     size_t inbuf;
1592     const size_t nulLen = GetMBNulLen();
1593     switch ( nulLen )
1594     {
1595         default:
1596             return (size_t)-1;
1597
1598         case 1:
1599             inbuf = strlen(psz); // arguably more optimized than our version
1600             break;
1601
1602         case 2:
1603         case 4:
1604             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1605             // they also have to start at character boundary and not span two
1606             // adjacent characters
1607             const char *p;
1608             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1609                 ;
1610             inbuf = p - psz;
1611             break;
1612     }
1613
1614 #if wxUSE_THREADS
1615     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1616     //     Unfortunately there is a couple of global wxCSConv objects such as
1617     //     wxConvLocal that are used all over wx code, so we have to make sure
1618     //     the handle is used by at most one thread at the time. Otherwise
1619     //     only a few wx classes would be safe to use from non-main threads
1620     //     as MB<->WC conversion would fail "randomly".
1621     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1622 #endif // wxUSE_THREADS
1623
1624
1625     size_t outbuf = n * SIZEOF_WCHAR_T;
1626     size_t res, cres;
1627     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1628     wchar_t *bufPtr = buf;
1629     const char *pszPtr = psz;
1630
1631     if (buf)
1632     {
1633         // have destination buffer, convert there
1634         cres = iconv(m2w,
1635                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1636                      (char**)&bufPtr, &outbuf);
1637         res = n - (outbuf / SIZEOF_WCHAR_T);
1638
1639         if (ms_wcNeedsSwap)
1640         {
1641             // convert to native endianness
1642             for ( unsigned i = 0; i < res; i++ )
1643                 buf[n] = WC_BSWAP(buf[i]);
1644         }
1645
1646         // NUL-terminate the string if there is any space left
1647         if (res < n)
1648             buf[res] = 0;
1649     }
1650     else
1651     {
1652         // no destination buffer... convert using temp buffer
1653         // to calculate destination buffer requirement
1654         wchar_t tbuf[8];
1655         res = 0;
1656         do {
1657             bufPtr = tbuf;
1658             outbuf = 8*SIZEOF_WCHAR_T;
1659
1660             cres = iconv(m2w,
1661                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1662                          (char**)&bufPtr, &outbuf );
1663
1664             res += 8-(outbuf/SIZEOF_WCHAR_T);
1665         } while ((cres==(size_t)-1) && (errno==E2BIG));
1666     }
1667
1668     if (ICONV_FAILED(cres, inbuf))
1669     {
1670         //VS: it is ok if iconv fails, hence trace only
1671         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1672         return (size_t)-1;
1673     }
1674
1675     return res;
1676 }
1677
1678 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1679 {
1680 #if wxUSE_THREADS
1681     // NB: explained in MB2WC
1682     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1683 #endif
1684
1685     size_t inlen = wxWcslen(psz);
1686     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1687     size_t outbuf = n;
1688     size_t res, cres;
1689
1690     wchar_t *tmpbuf = 0;
1691
1692     if (ms_wcNeedsSwap)
1693     {
1694         // need to copy to temp buffer to switch endianness
1695         // (doing WC_BSWAP twice on the original buffer won't help, as it
1696         //  could be in read-only memory, or be accessed in some other thread)
1697         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1698         for ( size_t i = 0; i < inlen; i++ )
1699             tmpbuf[n] = WC_BSWAP(psz[i]);
1700         tmpbuf[inlen] = L'\0';
1701         psz = tmpbuf;
1702     }
1703
1704     if (buf)
1705     {
1706         // have destination buffer, convert there
1707         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1708
1709         res = n-outbuf;
1710
1711         // NB: iconv was given only wcslen(psz) characters on input, and so
1712         //     it couldn't convert the trailing zero. Let's do it ourselves
1713         //     if there's some room left for it in the output buffer.
1714         if (res < n)
1715             buf[0] = 0;
1716     }
1717     else
1718     {
1719         // no destination buffer... convert using temp buffer
1720         // to calculate destination buffer requirement
1721         char tbuf[16];
1722         res = 0;
1723         do {
1724             buf = tbuf; outbuf = 16;
1725
1726             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1727
1728             res += 16 - outbuf;
1729         } while ((cres==(size_t)-1) && (errno==E2BIG));
1730     }
1731
1732     if (ms_wcNeedsSwap)
1733     {
1734         free(tmpbuf);
1735     }
1736
1737     if (ICONV_FAILED(cres, inbuf))
1738     {
1739         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1740         return (size_t)-1;
1741     }
1742
1743     return res;
1744 }
1745
1746 size_t wxMBConv_iconv::GetMBNulLen() const
1747 {
1748     if ( m_minMBCharWidth == 0 )
1749     {
1750         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1751
1752 #if wxUSE_THREADS
1753         // NB: explained in MB2WC
1754         wxMutexLocker lock(self->m_iconvMutex);
1755 #endif
1756
1757         wchar_t *wnul = L"";
1758         char buf[8]; // should be enough for NUL in any encoding
1759         size_t inLen = sizeof(wchar_t),
1760                outLen = WXSIZEOF(buf);
1761         char *in = (char *)wnul;
1762         char *out = buf;
1763         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1764         {
1765             self->m_minMBCharWidth = (size_t)-1;
1766         }
1767         else // ok
1768         {
1769             self->m_minMBCharWidth = out - buf;
1770         }
1771     }
1772
1773     return m_minMBCharWidth;
1774 }
1775
1776 #endif // HAVE_ICONV
1777
1778
1779 // ============================================================================
1780 // Win32 conversion classes
1781 // ============================================================================
1782
1783 #ifdef wxHAVE_WIN32_MB2WC
1784
1785 // from utils.cpp
1786 #if wxUSE_FONTMAP
1787 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1788 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1789 #endif
1790
1791 class wxMBConv_win32 : public wxMBConv
1792 {
1793 public:
1794     wxMBConv_win32()
1795     {
1796         m_CodePage = CP_ACP;
1797         m_minMBCharWidth = 0;
1798     }
1799
1800 #if wxUSE_FONTMAP
1801     wxMBConv_win32(const wxChar* name)
1802     {
1803         m_CodePage = wxCharsetToCodepage(name);
1804         m_minMBCharWidth = 0;
1805     }
1806
1807     wxMBConv_win32(wxFontEncoding encoding)
1808     {
1809         m_CodePage = wxEncodingToCodepage(encoding);
1810         m_minMBCharWidth = 0;
1811     }
1812 #endif // wxUSE_FONTMAP
1813
1814     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1815     {
1816         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1817         // the behaviour is not compatible with the Unix version (using iconv)
1818         // and break the library itself, e.g. wxTextInputStream::NextChar()
1819         // wouldn't work if reading an incomplete MB char didn't result in an
1820         // error
1821         //
1822         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1823         // Win XP or newer and it is not supported for UTF-[78] so we always
1824         // use our own conversions in this case. See
1825         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1826         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1827         if ( m_CodePage == CP_UTF8 )
1828         {
1829             return wxConvUTF8.MB2WC(buf, psz, n);
1830         }
1831
1832         if ( m_CodePage == CP_UTF7 )
1833         {
1834             return wxConvUTF7.MB2WC(buf, psz, n);
1835         }
1836
1837         int flags = 0;
1838         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1839                 IsAtLeastWin2kSP4() )
1840         {
1841             flags = MB_ERR_INVALID_CHARS;
1842         }
1843
1844         const size_t len = ::MultiByteToWideChar
1845                              (
1846                                 m_CodePage,     // code page
1847                                 flags,          // flags: fall on error
1848                                 psz,            // input string
1849                                 -1,             // its length (NUL-terminated)
1850                                 buf,            // output string
1851                                 buf ? n : 0     // size of output buffer
1852                              );
1853         if ( !len )
1854         {
1855             // function totally failed
1856             return (size_t)-1;
1857         }
1858
1859         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1860         // check if we succeeded, by doing a double trip:
1861         if ( !flags && buf )
1862         {
1863             const size_t mbLen = strlen(psz);
1864             wxCharBuffer mbBuf(mbLen);
1865             if ( ::WideCharToMultiByte
1866                    (
1867                       m_CodePage,
1868                       0,
1869                       buf,
1870                       -1,
1871                       mbBuf.data(),
1872                       mbLen + 1,        // size in bytes, not length
1873                       NULL,
1874                       NULL
1875                    ) == 0 ||
1876                   strcmp(mbBuf, psz) != 0 )
1877             {
1878                 // we didn't obtain the same thing we started from, hence
1879                 // the conversion was lossy and we consider that it failed
1880                 return (size_t)-1;
1881             }
1882         }
1883
1884         // note that it returns count of written chars for buf != NULL and size
1885         // of the needed buffer for buf == NULL so in either case the length of
1886         // the string (which never includes the terminating NUL) is one less
1887         return len - 1;
1888     }
1889
1890     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1891     {
1892         /*
1893             we have a problem here: by default, WideCharToMultiByte() may
1894             replace characters unrepresentable in the target code page with bad
1895             quality approximations such as turning "1/2" symbol (U+00BD) into
1896             "1" for the code pages which don't have it and we, obviously, want
1897             to avoid this at any price
1898
1899             the trouble is that this function does it _silently_, i.e. it won't
1900             even tell us whether it did or not... Win98/2000 and higher provide
1901             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1902             we have to resort to a round trip, i.e. check that converting back
1903             results in the same string -- this is, of course, expensive but
1904             otherwise we simply can't be sure to not garble the data.
1905          */
1906
1907         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1908         // it doesn't work with CJK encodings (which we test for rather roughly
1909         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1910         // supporting it
1911         BOOL usedDef wxDUMMY_INITIALIZE(false);
1912         BOOL *pUsedDef;
1913         int flags;
1914         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1915         {
1916             // it's our lucky day
1917             flags = WC_NO_BEST_FIT_CHARS;
1918             pUsedDef = &usedDef;
1919         }
1920         else // old system or unsupported encoding
1921         {
1922             flags = 0;
1923             pUsedDef = NULL;
1924         }
1925
1926         const size_t len = ::WideCharToMultiByte
1927                              (
1928                                 m_CodePage,     // code page
1929                                 flags,          // either none or no best fit
1930                                 pwz,            // input string
1931                                 -1,             // it is (wide) NUL-terminated
1932                                 buf,            // output buffer
1933                                 buf ? n : 0,    // and its size
1934                                 NULL,           // default "replacement" char
1935                                 pUsedDef        // [out] was it used?
1936                              );
1937
1938         if ( !len )
1939         {
1940             // function totally failed
1941             return (size_t)-1;
1942         }
1943
1944         // if we were really converting, check if we succeeded
1945         if ( buf )
1946         {
1947             if ( flags )
1948             {
1949                 // check if the conversion failed, i.e. if any replacements
1950                 // were done
1951                 if ( usedDef )
1952                     return (size_t)-1;
1953             }
1954             else // we must resort to double tripping...
1955             {
1956                 wxWCharBuffer wcBuf(n);
1957                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1958                         wcscmp(wcBuf, pwz) != 0 )
1959                 {
1960                     // we didn't obtain the same thing we started from, hence
1961                     // the conversion was lossy and we consider that it failed
1962                     return (size_t)-1;
1963                 }
1964             }
1965         }
1966
1967         // see the comment above for the reason of "len - 1"
1968         return len - 1;
1969     }
1970
1971     virtual size_t GetMBNulLen() const
1972     {
1973         if ( m_minMBCharWidth == 0 )
1974         {
1975             int len = ::WideCharToMultiByte
1976                         (
1977                             m_CodePage,     // code page
1978                             0,              // no flags
1979                             L"",            // input string
1980                             1,              // translate just the NUL
1981                             NULL,           // output buffer
1982                             0,              // and its size
1983                             NULL,           // no replacement char
1984                             NULL            // [out] don't care if it was used
1985                         );
1986
1987             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1988             switch ( len )
1989             {
1990                 default:
1991                     wxLogDebug(_T("Unexpected NUL length %d"), len);
1992                     // fall through
1993
1994                 case 0:
1995                     self->m_minMBCharWidth = (size_t)-1;
1996                     break;
1997
1998                 case 1:
1999                 case 2:
2000                 case 4:
2001                     self->m_minMBCharWidth = len;
2002                     break;
2003             }
2004         }
2005
2006         return m_minMBCharWidth;
2007     }
2008
2009     bool IsOk() const { return m_CodePage != -1; }
2010
2011 private:
2012     static bool CanUseNoBestFit()
2013     {
2014         static int s_isWin98Or2k = -1;
2015
2016         if ( s_isWin98Or2k == -1 )
2017         {
2018             int verMaj, verMin;
2019             switch ( wxGetOsVersion(&verMaj, &verMin) )
2020             {
2021                 case wxWIN95:
2022                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2023                     break;
2024
2025                 case wxWINDOWS_NT:
2026                     s_isWin98Or2k = verMaj >= 5;
2027                     break;
2028
2029                 default:
2030                     // unknown, be conseravtive by default
2031                     s_isWin98Or2k = 0;
2032             }
2033
2034             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2035         }
2036
2037         return s_isWin98Or2k == 1;
2038     }
2039
2040     static bool IsAtLeastWin2kSP4()
2041     {
2042 #ifdef __WXWINCE__
2043         return false;
2044 #else
2045         static int s_isAtLeastWin2kSP4 = -1;
2046
2047         if ( s_isAtLeastWin2kSP4 == -1 )
2048         {
2049             OSVERSIONINFOEX ver;
2050
2051             memset(&ver, 0, sizeof(ver));
2052             ver.dwOSVersionInfoSize = sizeof(ver);
2053             GetVersionEx((OSVERSIONINFO*)&ver);
2054
2055             s_isAtLeastWin2kSP4 =
2056               ((ver.dwMajorVersion > 5) || // Vista+
2057                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2058                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2059                ver.wServicePackMajor >= 4)) // 2000 SP4+
2060               ? 1 : 0;
2061         }
2062
2063         return s_isAtLeastWin2kSP4 == 1;
2064 #endif
2065     }
2066
2067
2068     // the code page we're working with
2069     long m_CodePage;
2070
2071     // cached result of GetMBNulLen(), set to 0 initially meaning
2072     // "unknown"
2073     size_t m_minMBCharWidth;
2074 };
2075
2076 #endif // wxHAVE_WIN32_MB2WC
2077
2078 // ============================================================================
2079 // Cocoa conversion classes
2080 // ============================================================================
2081
2082 #if defined(__WXCOCOA__)
2083
2084 // RN:  There is no public UTF-32 support in either Core Foundation or
2085 // Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
2086 // quite a bit - it's just not public (yet).
2087
2088 #include <CoreFoundation/CFString.h>
2089 #include <CoreFoundation/CFStringEncodingExt.h>
2090
2091 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2092 {
2093     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2094     if ( encoding == wxFONTENCODING_DEFAULT )
2095     {
2096         enc = CFStringGetSystemEncoding();
2097     }
2098     else switch( encoding)
2099     {
2100         case wxFONTENCODING_ISO8859_1 :
2101             enc = kCFStringEncodingISOLatin1 ;
2102             break ;
2103         case wxFONTENCODING_ISO8859_2 :
2104             enc = kCFStringEncodingISOLatin2;
2105             break ;
2106         case wxFONTENCODING_ISO8859_3 :
2107             enc = kCFStringEncodingISOLatin3 ;
2108             break ;
2109         case wxFONTENCODING_ISO8859_4 :
2110             enc = kCFStringEncodingISOLatin4;
2111             break ;
2112         case wxFONTENCODING_ISO8859_5 :
2113             enc = kCFStringEncodingISOLatinCyrillic;
2114             break ;
2115         case wxFONTENCODING_ISO8859_6 :
2116             enc = kCFStringEncodingISOLatinArabic;
2117             break ;
2118         case wxFONTENCODING_ISO8859_7 :
2119             enc = kCFStringEncodingISOLatinGreek;
2120             break ;
2121         case wxFONTENCODING_ISO8859_8 :
2122             enc = kCFStringEncodingISOLatinHebrew;
2123             break ;
2124         case wxFONTENCODING_ISO8859_9 :
2125             enc = kCFStringEncodingISOLatin5;
2126             break ;
2127         case wxFONTENCODING_ISO8859_10 :
2128             enc = kCFStringEncodingISOLatin6;
2129             break ;
2130         case wxFONTENCODING_ISO8859_11 :
2131             enc = kCFStringEncodingISOLatinThai;
2132             break ;
2133         case wxFONTENCODING_ISO8859_13 :
2134             enc = kCFStringEncodingISOLatin7;
2135             break ;
2136         case wxFONTENCODING_ISO8859_14 :
2137             enc = kCFStringEncodingISOLatin8;
2138             break ;
2139         case wxFONTENCODING_ISO8859_15 :
2140             enc = kCFStringEncodingISOLatin9;
2141             break ;
2142
2143         case wxFONTENCODING_KOI8 :
2144             enc = kCFStringEncodingKOI8_R;
2145             break ;
2146         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2147             enc = kCFStringEncodingDOSRussian;
2148             break ;
2149
2150 //      case wxFONTENCODING_BULGARIAN :
2151 //          enc = ;
2152 //          break ;
2153
2154         case wxFONTENCODING_CP437 :
2155             enc =kCFStringEncodingDOSLatinUS ;
2156             break ;
2157         case wxFONTENCODING_CP850 :
2158             enc = kCFStringEncodingDOSLatin1;
2159             break ;
2160         case wxFONTENCODING_CP852 :
2161             enc = kCFStringEncodingDOSLatin2;
2162             break ;
2163         case wxFONTENCODING_CP855 :
2164             enc = kCFStringEncodingDOSCyrillic;
2165             break ;
2166         case wxFONTENCODING_CP866 :
2167             enc =kCFStringEncodingDOSRussian ;
2168             break ;
2169         case wxFONTENCODING_CP874 :
2170             enc = kCFStringEncodingDOSThai;
2171             break ;
2172         case wxFONTENCODING_CP932 :
2173             enc = kCFStringEncodingDOSJapanese;
2174             break ;
2175         case wxFONTENCODING_CP936 :
2176             enc =kCFStringEncodingDOSChineseSimplif ;
2177             break ;
2178         case wxFONTENCODING_CP949 :
2179             enc = kCFStringEncodingDOSKorean;
2180             break ;
2181         case wxFONTENCODING_CP950 :
2182             enc = kCFStringEncodingDOSChineseTrad;
2183             break ;
2184         case wxFONTENCODING_CP1250 :
2185             enc = kCFStringEncodingWindowsLatin2;
2186             break ;
2187         case wxFONTENCODING_CP1251 :
2188             enc =kCFStringEncodingWindowsCyrillic ;
2189             break ;
2190         case wxFONTENCODING_CP1252 :
2191             enc =kCFStringEncodingWindowsLatin1 ;
2192             break ;
2193         case wxFONTENCODING_CP1253 :
2194             enc = kCFStringEncodingWindowsGreek;
2195             break ;
2196         case wxFONTENCODING_CP1254 :
2197             enc = kCFStringEncodingWindowsLatin5;
2198             break ;
2199         case wxFONTENCODING_CP1255 :
2200             enc =kCFStringEncodingWindowsHebrew ;
2201             break ;
2202         case wxFONTENCODING_CP1256 :
2203             enc =kCFStringEncodingWindowsArabic ;
2204             break ;
2205         case wxFONTENCODING_CP1257 :
2206             enc = kCFStringEncodingWindowsBalticRim;
2207             break ;
2208 //   This only really encodes to UTF7 (if that) evidently
2209 //        case wxFONTENCODING_UTF7 :
2210 //            enc = kCFStringEncodingNonLossyASCII ;
2211 //            break ;
2212         case wxFONTENCODING_UTF8 :
2213             enc = kCFStringEncodingUTF8 ;
2214             break ;
2215         case wxFONTENCODING_EUC_JP :
2216             enc = kCFStringEncodingEUC_JP;
2217             break ;
2218         case wxFONTENCODING_UTF16 :
2219             enc = kCFStringEncodingUnicode ;
2220             break ;
2221         case wxFONTENCODING_MACROMAN :
2222             enc = kCFStringEncodingMacRoman ;
2223             break ;
2224         case wxFONTENCODING_MACJAPANESE :
2225             enc = kCFStringEncodingMacJapanese ;
2226             break ;
2227         case wxFONTENCODING_MACCHINESETRAD :
2228             enc = kCFStringEncodingMacChineseTrad ;
2229             break ;
2230         case wxFONTENCODING_MACKOREAN :
2231             enc = kCFStringEncodingMacKorean ;
2232             break ;
2233         case wxFONTENCODING_MACARABIC :
2234             enc = kCFStringEncodingMacArabic ;
2235             break ;
2236         case wxFONTENCODING_MACHEBREW :
2237             enc = kCFStringEncodingMacHebrew ;
2238             break ;
2239         case wxFONTENCODING_MACGREEK :
2240             enc = kCFStringEncodingMacGreek ;
2241             break ;
2242         case wxFONTENCODING_MACCYRILLIC :
2243             enc = kCFStringEncodingMacCyrillic ;
2244             break ;
2245         case wxFONTENCODING_MACDEVANAGARI :
2246             enc = kCFStringEncodingMacDevanagari ;
2247             break ;
2248         case wxFONTENCODING_MACGURMUKHI :
2249             enc = kCFStringEncodingMacGurmukhi ;
2250             break ;
2251         case wxFONTENCODING_MACGUJARATI :
2252             enc = kCFStringEncodingMacGujarati ;
2253             break ;
2254         case wxFONTENCODING_MACORIYA :
2255             enc = kCFStringEncodingMacOriya ;
2256             break ;
2257         case wxFONTENCODING_MACBENGALI :
2258             enc = kCFStringEncodingMacBengali ;
2259             break ;
2260         case wxFONTENCODING_MACTAMIL :
2261             enc = kCFStringEncodingMacTamil ;
2262             break ;
2263         case wxFONTENCODING_MACTELUGU :
2264             enc = kCFStringEncodingMacTelugu ;
2265             break ;
2266         case wxFONTENCODING_MACKANNADA :
2267             enc = kCFStringEncodingMacKannada ;
2268             break ;
2269         case wxFONTENCODING_MACMALAJALAM :
2270             enc = kCFStringEncodingMacMalayalam ;
2271             break ;
2272         case wxFONTENCODING_MACSINHALESE :
2273             enc = kCFStringEncodingMacSinhalese ;
2274             break ;
2275         case wxFONTENCODING_MACBURMESE :
2276             enc = kCFStringEncodingMacBurmese ;
2277             break ;
2278         case wxFONTENCODING_MACKHMER :
2279             enc = kCFStringEncodingMacKhmer ;
2280             break ;
2281         case wxFONTENCODING_MACTHAI :
2282             enc = kCFStringEncodingMacThai ;
2283             break ;
2284         case wxFONTENCODING_MACLAOTIAN :
2285             enc = kCFStringEncodingMacLaotian ;
2286             break ;
2287         case wxFONTENCODING_MACGEORGIAN :
2288             enc = kCFStringEncodingMacGeorgian ;
2289             break ;
2290         case wxFONTENCODING_MACARMENIAN :
2291             enc = kCFStringEncodingMacArmenian ;
2292             break ;
2293         case wxFONTENCODING_MACCHINESESIMP :
2294             enc = kCFStringEncodingMacChineseSimp ;
2295             break ;
2296         case wxFONTENCODING_MACTIBETAN :
2297             enc = kCFStringEncodingMacTibetan ;
2298             break ;
2299         case wxFONTENCODING_MACMONGOLIAN :
2300             enc = kCFStringEncodingMacMongolian ;
2301             break ;
2302         case wxFONTENCODING_MACETHIOPIC :
2303             enc = kCFStringEncodingMacEthiopic ;
2304             break ;
2305         case wxFONTENCODING_MACCENTRALEUR :
2306             enc = kCFStringEncodingMacCentralEurRoman ;
2307             break ;
2308         case wxFONTENCODING_MACVIATNAMESE :
2309             enc = kCFStringEncodingMacVietnamese ;
2310             break ;
2311         case wxFONTENCODING_MACARABICEXT :
2312             enc = kCFStringEncodingMacExtArabic ;
2313             break ;
2314         case wxFONTENCODING_MACSYMBOL :
2315             enc = kCFStringEncodingMacSymbol ;
2316             break ;
2317         case wxFONTENCODING_MACDINGBATS :
2318             enc = kCFStringEncodingMacDingbats ;
2319             break ;
2320         case wxFONTENCODING_MACTURKISH :
2321             enc = kCFStringEncodingMacTurkish ;
2322             break ;
2323         case wxFONTENCODING_MACCROATIAN :
2324             enc = kCFStringEncodingMacCroatian ;
2325             break ;
2326         case wxFONTENCODING_MACICELANDIC :
2327             enc = kCFStringEncodingMacIcelandic ;
2328             break ;
2329         case wxFONTENCODING_MACROMANIAN :
2330             enc = kCFStringEncodingMacRomanian ;
2331             break ;
2332         case wxFONTENCODING_MACCELTIC :
2333             enc = kCFStringEncodingMacCeltic ;
2334             break ;
2335         case wxFONTENCODING_MACGAELIC :
2336             enc = kCFStringEncodingMacGaelic ;
2337             break ;
2338 //      case wxFONTENCODING_MACKEYBOARD :
2339 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2340 //          break ;
2341         default :
2342             // because gcc is picky
2343             break ;
2344     } ;
2345     return enc ;
2346 }
2347
2348 class wxMBConv_cocoa : public wxMBConv
2349 {
2350 public:
2351     wxMBConv_cocoa()
2352     {
2353         Init(CFStringGetSystemEncoding()) ;
2354     }
2355
2356 #if wxUSE_FONTMAP
2357     wxMBConv_cocoa(const wxChar* name)
2358     {
2359         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2360     }
2361 #endif
2362
2363     wxMBConv_cocoa(wxFontEncoding encoding)
2364     {
2365         Init( wxCFStringEncFromFontEnc(encoding) );
2366     }
2367
2368     ~wxMBConv_cocoa()
2369     {
2370     }
2371
2372     void Init( CFStringEncoding encoding)
2373     {
2374         m_encoding = encoding ;
2375     }
2376
2377     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2378     {
2379         wxASSERT(szUnConv);
2380
2381         CFStringRef theString = CFStringCreateWithBytes (
2382                                                 NULL, //the allocator
2383                                                 (const UInt8*)szUnConv,
2384                                                 strlen(szUnConv),
2385                                                 m_encoding,
2386                                                 false //no BOM/external representation
2387                                                 );
2388
2389         wxASSERT(theString);
2390
2391         size_t nOutLength = CFStringGetLength(theString);
2392
2393         if (szOut == NULL)
2394         {
2395             CFRelease(theString);
2396             return nOutLength;
2397         }
2398
2399         CFRange theRange = { 0, nOutSize };
2400
2401 #if SIZEOF_WCHAR_T == 4
2402         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2403 #endif
2404
2405         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2406
2407         CFRelease(theString);
2408
2409         szUniCharBuffer[nOutLength] = '\0' ;
2410
2411 #if SIZEOF_WCHAR_T == 4
2412         wxMBConvUTF16 converter ;
2413         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2414         delete[] szUniCharBuffer;
2415 #endif
2416
2417         return nOutLength;
2418     }
2419
2420     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2421     {
2422         wxASSERT(szUnConv);
2423
2424         size_t nRealOutSize;
2425         size_t nBufSize = wxWcslen(szUnConv);
2426         UniChar* szUniBuffer = (UniChar*) szUnConv;
2427
2428 #if SIZEOF_WCHAR_T == 4
2429         wxMBConvUTF16 converter ;
2430         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2431         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2432         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2433         nBufSize /= sizeof(UniChar);
2434 #endif
2435
2436         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2437                                 NULL, //allocator
2438                                 szUniBuffer,
2439                                 nBufSize,
2440                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2441                             );
2442
2443         wxASSERT(theString);
2444
2445         //Note that CER puts a BOM when converting to unicode
2446         //so we  check and use getchars instead in that case
2447         if (m_encoding == kCFStringEncodingUnicode)
2448         {
2449             if (szOut != NULL)
2450                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2451
2452             nRealOutSize = CFStringGetLength(theString) + 1;
2453         }
2454         else
2455         {
2456             CFStringGetBytes(
2457                 theString,
2458                 CFRangeMake(0, CFStringGetLength(theString)),
2459                 m_encoding,
2460                 0, //what to put in characters that can't be converted -
2461                     //0 tells CFString to return NULL if it meets such a character
2462                 false, //not an external representation
2463                 (UInt8*) szOut,
2464                 nOutSize,
2465                 (CFIndex*) &nRealOutSize
2466                         );
2467         }
2468
2469         CFRelease(theString);
2470
2471 #if SIZEOF_WCHAR_T == 4
2472         delete[] szUniBuffer;
2473 #endif
2474
2475         return  nRealOutSize - 1;
2476     }
2477
2478     bool IsOk() const
2479     {
2480         return m_encoding != kCFStringEncodingInvalidId &&
2481               CFStringIsEncodingAvailable(m_encoding);
2482     }
2483
2484 private:
2485     CFStringEncoding m_encoding ;
2486 };
2487
2488 #endif // defined(__WXCOCOA__)
2489
2490 // ============================================================================
2491 // Mac conversion classes
2492 // ============================================================================
2493
2494 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2495
2496 class wxMBConv_mac : public wxMBConv
2497 {
2498 public:
2499     wxMBConv_mac()
2500     {
2501         Init(CFStringGetSystemEncoding()) ;
2502     }
2503
2504 #if wxUSE_FONTMAP
2505     wxMBConv_mac(const wxChar* name)
2506     {
2507         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2508     }
2509 #endif
2510
2511     wxMBConv_mac(wxFontEncoding encoding)
2512     {
2513         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2514     }
2515
2516     ~wxMBConv_mac()
2517     {
2518         OSStatus status = noErr ;
2519         status = TECDisposeConverter(m_MB2WC_converter);
2520         status = TECDisposeConverter(m_WC2MB_converter);
2521     }
2522
2523
2524     void Init( TextEncodingBase encoding)
2525     {
2526         OSStatus status = noErr ;
2527         m_char_encoding = encoding ;
2528         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2529
2530         status = TECCreateConverter(&m_MB2WC_converter,
2531                                     m_char_encoding,
2532                                     m_unicode_encoding);
2533         status = TECCreateConverter(&m_WC2MB_converter,
2534                                     m_unicode_encoding,
2535                                     m_char_encoding);
2536     }
2537
2538     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2539     {
2540         OSStatus status = noErr ;
2541         ByteCount byteOutLen ;
2542         ByteCount byteInLen = strlen(psz) ;
2543         wchar_t *tbuf = NULL ;
2544         UniChar* ubuf = NULL ;
2545         size_t res = 0 ;
2546
2547         if (buf == NULL)
2548         {
2549             //apple specs say at least 32
2550             n = wxMax( 32 , byteInLen ) ;
2551             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2552         }
2553         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2554 #if SIZEOF_WCHAR_T == 4
2555         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2556 #else
2557         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2558 #endif
2559         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2560           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2561 #if SIZEOF_WCHAR_T == 4
2562         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2563         // is not properly terminated we get random characters at the end
2564         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2565         wxMBConvUTF16 converter ;
2566         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2567         free( ubuf ) ;
2568 #else
2569         res = byteOutLen / sizeof( UniChar ) ;
2570 #endif
2571         if ( buf == NULL )
2572              free(tbuf) ;
2573
2574         if ( buf  && res < n)
2575             buf[res] = 0;
2576
2577         return res ;
2578     }
2579
2580     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2581     {
2582         OSStatus status = noErr ;
2583         ByteCount byteOutLen ;
2584         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2585
2586         char *tbuf = NULL ;
2587
2588         if (buf == NULL)
2589         {
2590             //apple specs say at least 32
2591             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2592             tbuf = (char*) malloc( n ) ;
2593         }
2594
2595         ByteCount byteBufferLen = n ;
2596         UniChar* ubuf = NULL ;
2597 #if SIZEOF_WCHAR_T == 4
2598         wxMBConvUTF16 converter ;
2599         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2600         byteInLen = unicharlen ;
2601         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2602         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2603 #else
2604         ubuf = (UniChar*) psz ;
2605 #endif
2606         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2607             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2608 #if SIZEOF_WCHAR_T == 4
2609         free( ubuf ) ;
2610 #endif
2611         if ( buf == NULL )
2612             free(tbuf) ;
2613
2614         size_t res = byteOutLen ;
2615         if ( buf  && res < n)
2616         {
2617             buf[res] = 0;
2618
2619             //we need to double-trip to verify it didn't insert any ? in place
2620             //of bogus characters
2621             wxWCharBuffer wcBuf(n);
2622             size_t pszlen = wxWcslen(psz);
2623             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2624                         wxWcslen(wcBuf) != pszlen ||
2625                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2626             {
2627                 // we didn't obtain the same thing we started from, hence
2628                 // the conversion was lossy and we consider that it failed
2629                 return (size_t)-1;
2630             }
2631         }
2632
2633         return res ;
2634     }
2635
2636     bool IsOk() const
2637         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2638
2639 private:
2640     TECObjectRef m_MB2WC_converter ;
2641     TECObjectRef m_WC2MB_converter ;
2642
2643     TextEncodingBase m_char_encoding ;
2644     TextEncodingBase m_unicode_encoding ;
2645 };
2646
2647 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2648
2649 // ============================================================================
2650 // wxEncodingConverter based conversion classes
2651 // ============================================================================
2652
2653 #if wxUSE_FONTMAP
2654
2655 class wxMBConv_wxwin : public wxMBConv
2656 {
2657 private:
2658     void Init()
2659     {
2660         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2661                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2662     }
2663
2664 public:
2665     // temporarily just use wxEncodingConverter stuff,
2666     // so that it works while a better implementation is built
2667     wxMBConv_wxwin(const wxChar* name)
2668     {
2669         if (name)
2670             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2671         else
2672             m_enc = wxFONTENCODING_SYSTEM;
2673
2674         Init();
2675     }
2676
2677     wxMBConv_wxwin(wxFontEncoding enc)
2678     {
2679         m_enc = enc;
2680
2681         Init();
2682     }
2683
2684     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2685     {
2686         size_t inbuf = strlen(psz);
2687         if (buf)
2688         {
2689             if (!m2w.Convert(psz,buf))
2690                 return (size_t)-1;
2691         }
2692         return inbuf;
2693     }
2694
2695     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2696     {
2697         const size_t inbuf = wxWcslen(psz);
2698         if (buf)
2699         {
2700             if (!w2m.Convert(psz,buf))
2701                 return (size_t)-1;
2702         }
2703
2704         return inbuf;
2705     }
2706
2707     virtual size_t GetMBNulLen() const
2708     {
2709         switch ( m_enc )
2710         {
2711             case wxFONTENCODING_UTF16BE:
2712             case wxFONTENCODING_UTF16LE:
2713                 return 2;
2714
2715             case wxFONTENCODING_UTF32BE:
2716             case wxFONTENCODING_UTF32LE:
2717                 return 4;
2718
2719             default:
2720                 return 1;
2721         }
2722     }
2723
2724     bool IsOk() const { return m_ok; }
2725
2726 public:
2727     wxFontEncoding m_enc;
2728     wxEncodingConverter m2w, w2m;
2729
2730 private:
2731     // were we initialized successfully?
2732     bool m_ok;
2733
2734     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2735 };
2736
2737 // make the constructors available for unit testing
2738 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2739 {
2740     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2741     if ( !result->IsOk() )
2742     {
2743         delete result;
2744         return 0;
2745     }
2746     return result;
2747 }
2748
2749 #endif // wxUSE_FONTMAP
2750
2751 // ============================================================================
2752 // wxCSConv implementation
2753 // ============================================================================
2754
2755 void wxCSConv::Init()
2756 {
2757     m_name = NULL;
2758     m_convReal =  NULL;
2759     m_deferred = true;
2760 }
2761
2762 wxCSConv::wxCSConv(const wxChar *charset)
2763 {
2764     Init();
2765
2766     if ( charset )
2767     {
2768         SetName(charset);
2769     }
2770
2771 #if wxUSE_FONTMAP
2772     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2773 #else
2774     m_encoding = wxFONTENCODING_SYSTEM;
2775 #endif
2776 }
2777
2778 wxCSConv::wxCSConv(wxFontEncoding encoding)
2779 {
2780     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2781     {
2782         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2783
2784         encoding = wxFONTENCODING_SYSTEM;
2785     }
2786
2787     Init();
2788
2789     m_encoding = encoding;
2790 }
2791
2792 wxCSConv::~wxCSConv()
2793 {
2794     Clear();
2795 }
2796
2797 wxCSConv::wxCSConv(const wxCSConv& conv)
2798         : wxMBConv()
2799 {
2800     Init();
2801
2802     SetName(conv.m_name);
2803     m_encoding = conv.m_encoding;
2804 }
2805
2806 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2807 {
2808     Clear();
2809
2810     SetName(conv.m_name);
2811     m_encoding = conv.m_encoding;
2812
2813     return *this;
2814 }
2815
2816 void wxCSConv::Clear()
2817 {
2818     free(m_name);
2819     delete m_convReal;
2820
2821     m_name = NULL;
2822     m_convReal = NULL;
2823 }
2824
2825 void wxCSConv::SetName(const wxChar *charset)
2826 {
2827     if (charset)
2828     {
2829         m_name = wxStrdup(charset);
2830         m_deferred = true;
2831     }
2832 }
2833
2834 #if wxUSE_FONTMAP
2835 #include "wx/hashmap.h"
2836
     // hash map type associating a charset name with a font encoding
2837 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2838                      wxEncodingNameCache );
2839
     // Cache used by wxCSConv::DoCreate() when iconv is available: for each
     // encoding it holds the charset name with which an iconv converter could
     // be created, or an empty string if none of the names worked.
2840 static wxEncodingNameCache gs_nameCache;
2841 #endif
2842
2843 wxMBConv *wxCSConv::DoCreate() const
2844 {
2845 #if wxUSE_FONTMAP
2846     wxLogTrace(TRACE_STRCONV,
2847                wxT("creating conversion for %s"),
2848                (m_name ? m_name
2849                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2850 #endif // wxUSE_FONTMAP
2851
2852     // check for the special case of ASCII or ISO8859-1 charset: as we have
2853     // special knowledge of it anyhow, we don't need to create a special
2854     // conversion object
2855     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2856             m_encoding == wxFONTENCODING_DEFAULT )
2857     {
2858         // don't convert at all
2859         return NULL;
2860     }
2861
2862     // we trust OS to do conversion better than we can so try external
2863     // conversion methods first
2864     //
2865     // the full order is:
2866     //      1. OS conversion (iconv() under Unix or Win32 API)
2867     //      2. hard coded conversions for UTF
2868     //      3. wxEncodingConverter as fall back
2869
2870     // step (1)
2871 #ifdef HAVE_ICONV
2872 #if !wxUSE_FONTMAP
2873     if ( m_name )
2874 #endif // !wxUSE_FONTMAP
2875     {
2876         wxString name(m_name);
2877         wxFontEncoding encoding(m_encoding);
2878
2879         if ( !name.empty() )
2880         {
2881             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2882             if ( conv->IsOk() )
2883                 return conv;
2884
2885             delete conv;
2886
2887 #if wxUSE_FONTMAP
2888             encoding =
2889                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2890 #endif // wxUSE_FONTMAP
2891         }
2892 #if wxUSE_FONTMAP
2893         {
2894             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2895             if ( it != gs_nameCache.end() )
2896             {
2897                 if ( it->second.empty() )
2898                     return NULL;
2899
2900                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2901                 if ( conv->IsOk() )
2902                     return conv;
2903
2904                 delete conv;
2905             }
2906
2907             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2908
2909             for ( ; *names; ++names )
2910             {
2911                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2912                 if ( conv->IsOk() )
2913                 {
2914                     gs_nameCache[encoding] = *names;
2915                     return conv;
2916                 }
2917
2918                 delete conv;
2919             }
2920
2921             gs_nameCache[encoding] = _T(""); // cache the failure
2922         }
2923 #endif // wxUSE_FONTMAP
2924     }
2925 #endif // HAVE_ICONV
2926
2927 #ifdef wxHAVE_WIN32_MB2WC
2928     {
2929 #if wxUSE_FONTMAP
2930         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2931                                       : new wxMBConv_win32(m_encoding);
2932         if ( conv->IsOk() )
2933             return conv;
2934
2935         delete conv;
2936 #else
2937         return NULL;
2938 #endif
2939     }
2940 #endif // wxHAVE_WIN32_MB2WC
2941 #if defined(__WXMAC__)
2942     {
2943         // leave UTF16 and UTF32 to the built-ins of wx
2944         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2945             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2946         {
2947
2948 #if wxUSE_FONTMAP
2949             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2950                                         : new wxMBConv_mac(m_encoding);
2951 #else
2952             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2953 #endif
2954             if ( conv->IsOk() )
2955                  return conv;
2956
2957             delete conv;
2958         }
2959     }
2960 #endif
2961 #if defined(__WXCOCOA__)
2962     {
2963         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2964         {
2965
2966 #if wxUSE_FONTMAP
2967             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2968                                           : new wxMBConv_cocoa(m_encoding);
2969 #else
2970             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2971 #endif
2972             if ( conv->IsOk() )
2973                  return conv;
2974
2975             delete conv;
2976         }
2977     }
2978 #endif
2979     // step (2)
2980     wxFontEncoding enc = m_encoding;
2981 #if wxUSE_FONTMAP
2982     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2983     {
2984         // use "false" to suppress interactive dialogs -- we can be called from
2985         // anywhere and popping up a dialog from here is the last thing we want to
2986         // do
2987         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2988     }
2989 #endif // wxUSE_FONTMAP
2990
2991     switch ( enc )
2992     {
2993         case wxFONTENCODING_UTF7:
2994              return new wxMBConvUTF7;
2995
2996         case wxFONTENCODING_UTF8:
2997              return new wxMBConvUTF8;
2998
2999         case wxFONTENCODING_UTF16BE:
3000              return new wxMBConvUTF16BE;
3001
3002         case wxFONTENCODING_UTF16LE:
3003              return new wxMBConvUTF16LE;
3004
3005         case wxFONTENCODING_UTF32BE:
3006              return new wxMBConvUTF32BE;
3007
3008         case wxFONTENCODING_UTF32LE:
3009              return new wxMBConvUTF32LE;
3010
3011         default:
3012              // nothing to do but put here to suppress gcc warnings
3013              ;
3014     }
3015
3016     // step (3)
3017 #if wxUSE_FONTMAP
3018     {
3019         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3020                                       : new wxMBConv_wxwin(m_encoding);
3021         if ( conv->IsOk() )
3022             return conv;
3023
3024         delete conv;
3025     }
3026 #endif // wxUSE_FONTMAP
3027
3028     // NB: This is a hack to prevent deadlock. What could otherwise happen
3029     //     in Unicode build: wxConvLocal creation ends up being here
3030     //     because of some failure and logs the error. But wxLog will try to
3031     //     attach timestamp, for which it will need wxConvLocal (to convert
3032     //     time to char* and then wchar_t*), but that fails, tries to log
3033     //     error, but wxLog has a (already locked) critical section that
3034     //     guards static buffer.
3035     static bool alreadyLoggingError = false;
3036     if (!alreadyLoggingError)
3037     {
3038         alreadyLoggingError = true;
3039         wxLogError(_("Cannot convert from the charset '%s'!"),
3040                    m_name ? m_name
3041                       :
3042 #if wxUSE_FONTMAP
3043                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3044 #else // !wxUSE_FONTMAP
3045                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3046 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3047               );
3048         alreadyLoggingError = false;
3049     }
3050
3051     return NULL;
3052 }
3053
3054 void wxCSConv::CreateConvIfNeeded() const
3055 {
3056     if ( m_deferred )
3057     {
3058         wxCSConv *self = (wxCSConv *)this; // const_cast
3059
3060 #if wxUSE_INTL
3061         // if we don't have neither the name nor the encoding, use the default
3062         // encoding for this system
3063         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3064         {
3065             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3066         }
3067 #endif // wxUSE_INTL
3068
3069         self->m_convReal = DoCreate();
3070         self->m_deferred = false;
3071     }
3072 }
3073
3074 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3075 {
3076     CreateConvIfNeeded();
3077
3078     if (m_convReal)
3079         return m_convReal->MB2WC(buf, psz, n);
3080
3081     // latin-1 (direct)
3082     size_t len = strlen(psz);
3083
3084     if (buf)
3085     {
3086         for (size_t c = 0; c <= len; c++)
3087             buf[c] = (unsigned char)(psz[c]);
3088     }
3089
3090     return len;
3091 }
3092
3093 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3094 {
3095     CreateConvIfNeeded();
3096
3097     if (m_convReal)
3098         return m_convReal->WC2MB(buf, psz, n);
3099
3100     // latin-1 (direct)
3101     const size_t len = wxWcslen(psz);
3102     if (buf)
3103     {
3104         for (size_t c = 0; c <= len; c++)
3105         {
3106             if (psz[c] > 0xFF)
3107                 return (size_t)-1;
3108             buf[c] = (char)psz[c];
3109         }
3110     }
3111     else
3112     {
3113         for (size_t c = 0; c <= len; c++)
3114         {
3115             if (psz[c] > 0xFF)
3116                 return (size_t)-1;
3117         }
3118     }
3119
3120     return len;
3121 }
3122
3123 size_t wxCSConv::GetMBNulLen() const
3124 {
3125     CreateConvIfNeeded();
3126
3127     if ( m_convReal )
3128     {
3129         return m_convReal->GetMBNulLen();
3130     }
3131
3132     return 1;
3133 }
3134
3135 // ----------------------------------------------------------------------------
3136 // globals
3137 // ----------------------------------------------------------------------------
3138
3139 #ifdef __WINDOWS__
3140     static wxMBConv_win32 wxConvLibcObj;
3141 #elif defined(__WXMAC__) && !defined(__MACH__)
3142     static wxMBConv_mac wxConvLibcObj ;
3143 #else
3144     static wxMBConvLibc wxConvLibcObj;
3145 #endif
3146
3147 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3148 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3149 static wxMBConvUTF7 wxConvUTF7Obj;
3150 static wxMBConvUTF8 wxConvUTF8Obj;
3151
3152 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3153 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3154 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3155 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3156 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3157 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3158 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3159 #ifdef __WXOSX__
3160                                     wxConvUTF8Obj;
3161 #else
3162                                     wxConvLibcObj;
3163 #endif
3164
3165
3166 #else // !wxUSE_WCHAR_T
3167
3168 // stand-ins in absence of wchar_t
3169 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3170                                 wxConvISO8859_1,
3171                                 wxConvLocal,
3172                                 wxConvUTF8;
3173
3174 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T