src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // ----------------------------------------------------------------------------
  90 // UTF-16 en/decoding to/from UCS-4
  91 // ----------------------------------------------------------------------------
  92
  93
  94 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  95 {
  96     if (input<=0xffff)
  97     {
  98         if (output)
  99             *output = (wxUint16) input;
 100         return 1;
 101     }
 102     else if (input>=0x110000)
 103     {
 104         return (size_t)-1;
 105     }
 106     else
 107     {
 108         if (output)
 109         {
 110             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 111             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 112         }
 113         return 2;
 114     }
 115 }
 116
 117 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 118 {
 119     if ((*input<0xd800) || (*input>0xdfff))
 120     {
 121         output = *input;
 122         return 1;
 123     }
 124     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 125     {
 126         output = *input;
 127         return (size_t)-1;
 128     }
 129     else
 130     {
 131         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 132         return 2;
 133     }
 134 }
 135
 136
 137 // ----------------------------------------------------------------------------
 138 // wxMBConv
 139 // ----------------------------------------------------------------------------
 140
 141 wxMBConv::~wxMBConv()
 142 {
 143     // nothing to do here (necessary for Darwin linking probably)
 144 }
 145
 146 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 147 {
 148     if ( psz )
 149     {
 150         // calculate the length of the buffer needed first
 151         size_t nLen = MB2WC(NULL, psz, 0);
 152         if ( nLen != (size_t)-1 )
 153         {
 154             // now do the actual conversion
 155             wxWCharBuffer buf(nLen);
 156             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 157             if ( nLen != (size_t)-1 )
 158             {
 159                 return buf;
 160             }
 161         }
 162     }
 163
 164     wxWCharBuffer buf((wchar_t *)NULL);
 165
 166     return buf;
 167 }
 168
 169 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 170 {
 171     if ( pwz )
 172     {
 173         size_t nLen = WC2MB(NULL, pwz, 0);
 174         if ( nLen != (size_t)-1 )
 175         {
 176             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 177             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 178             if ( nLen != (size_t)-1 )
 179             {
 180                 return buf;
 181             }
 182         }
 183     }
 184
 185     wxCharBuffer buf((char *)NULL);
 186
 187     return buf;
 188 }
 189
 190 const wxWCharBuffer
 191 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 192 {
 193     // the currently accumulated wide characters
 194     wxWCharBuffer wbuf;
 195
 196     // the current length of wbuf
 197     size_t lenBuf = 0;
 198
 199     // we need to know the representation of L'\0' for this conversion
 200     size_t nulLen;
 201     const char * const nul = GetMBNul(&nulLen);
 202     if ( nulLen == (size_t)-1 || nulLen == 0 )
 203         return wxWCharBuffer();
 204
 205     // make a copy of the input string unless it is already properly
 206     // NUL-terminated
 207     wxCharBuffer bufTmp;
 208
 209     // now we can compute the input size if we were not given it: notice that
 210     // in this case the string must be properly NUL-terminated, of course, as
 211     // otherwise we have no way of knowing how long it is
 212     if ( inLen == (size_t)-1 )
 213     {
 214         // not the most efficient algorithm but it shouldn't matter as normally
 215         // there are not many NULs in the string and so normally memcmp()
 216         // should stop on the first character
 217         for ( const char *p = in; ; p++ )
 218         {
 219             if ( memcmp(p, nul, nulLen) == 0 )
 220                 break;
 221         }
 222
 223         inLen = p - in + nulLen;
 224     }
 225     else // we already have the size
 226     {
 227         // check if it's not already NUL-terminated too to avoid the copy
 228         if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
 229         {
 230             // make a copy in order to properly NUL-terminate the string
 231             bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
 232             memcpy(bufTmp.data(), in, inLen);
 233             memcpy(bufTmp.data() + inLen, nul, nulLen);
 234         }
 235     }
 236
 237     if ( bufTmp )
 238         in = bufTmp;
 239
 240     for ( const char * const inEnd = in + inLen;; )
 241     {
 242         // try to convert the current chunk if anything left
 243         size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
 244         if ( lenChunk == 0 )
 245         {
 246             // nothing left in the input string, conversion succeeded
 247             if ( outLen )
 248             {
 249                 // we shouldn't include the last NUL in the result length
 250                 *outLen = lenBuf ? lenBuf - 1 : 0;
 251             }
 252
 253             return wbuf;
 254         }
 255
 256         if ( lenChunk == (size_t)-1 )
 257             break;
 258
 259         const size_t lenBufNew = lenBuf + lenChunk;
 260         if ( !wbuf.extend(lenBufNew) )
 261             break;
 262
 263         lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 264         if ( lenChunk == (size_t)-1 )
 265             break;
 266
 267         // +! for the embedded NUL (if something follows)
 268         lenBuf = lenBufNew + 1;
 269
 270         // advance the input pointer past the end of this chunk
 271         while ( memcmp(in, nul, nulLen) != 0 )
 272             in++;
 273
 274         in += nulLen; // skipping over its terminator as well
 275     }
 276
 277     // conversion failed
 278     if ( outLen )
 279         *outLen = 0;
 280
 281     return wxWCharBuffer();
 282 }
 283
 284 const wxCharBuffer
 285 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 286 {
 287     // the currently accumulated multibyte characters
 288     wxCharBuffer buf;
 289
 290     // the current length of buf
 291     size_t lenBuf = 0;
 292
 293     // make a copy of the input string unless it is already properly
 294     // NUL-terminated
 295     //
 296     // if we don't know its length we have no choice but to assume that it is,
 297     // indeed, properly terminated
 298     wxWCharBuffer bufTmp;
 299     if ( inLen == (size_t)-1 )
 300     {
 301         inLen = wxWcslen(in) + 1;
 302     }
 303     else if ( inLen != 0 && in[inLen - 1] != L'\0' )
 304     {
 305         // make a copy in order to properly NUL-terminate the string
 306         bufTmp = wxWCharBuffer(inLen);
 307         memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
 308     }
 309
 310     if ( bufTmp )
 311         in = bufTmp;
 312
 313     for ( const wchar_t * const inEnd = in + inLen;; )
 314     {
 315         // try to convert the current chunk, if anything left
 316         size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
 317         if ( lenChunk == 0 )
 318         {
 319             // nothing left in the input string, conversion succeeded
 320             if ( outLen )
 321                 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
 322
 323             return buf;
 324         }
 325
 326         if ( lenChunk == (size_t)-1 )
 327             break;
 328
 329         const size_t lenBufNew = lenBuf + lenChunk;
 330         if ( !buf.extend(lenBufNew) )
 331             break;
 332
 333         lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 334         if ( lenChunk == (size_t)-1 )
 335             break;
 336
 337         // chunk successfully converted, go to the next one
 338         in += wxWcslen(in) + 1 /* skip NUL too */;
 339         lenBuf = lenBufNew + 1;
 340     }
 341
 342     // conversion failed
 343     if ( outLen )
 344         *outLen = 0;
 345
 346     return wxCharBuffer();
 347 }
 348
 349 // ----------------------------------------------------------------------------
 350 // wxMBConvLibc
 351 // ----------------------------------------------------------------------------
 352
 353 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 354 {
 355     return wxMB2WC(buf, psz, n);
 356 }
 357
 358 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 359 {
 360     return wxWC2MB(buf, psz, n);
 361 }
 362
 363 // ----------------------------------------------------------------------------
 364 // wxConvBrokenFileNames
 365 // ----------------------------------------------------------------------------
 366
 367 #ifdef __UNIX__
 368
 369 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 370 {
 371     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 372                   || wxStricmp(charset, _T("UTF8")) == 0  )
 373         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 374     else
 375         m_conv = new wxCSConv(charset);
 376 }
 377
 378 #endif // __UNIX__
 379
 380 // ----------------------------------------------------------------------------
 381 // UTF-7
 382 // ----------------------------------------------------------------------------
 383
 384 // Implementation (C) 2004 Fredrik Roubert
 385
 386 //
 387 // BASE64 decoding table
 388 //
 389 static const unsigned char utf7unb64[] =
 390 {
 391     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 392     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 393     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 394     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 395     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 396     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 397     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 398     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 399     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 400     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 401     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 402     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 403     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 404     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 405     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 406     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 407     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 408     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 409     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 410     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 411     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 412     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 413     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 414     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 415     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 416     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 417     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 418     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 419     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 420     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 421     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 422     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 423 };
 424
 425 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 426 {
 427     size_t len = 0;
 428
 429     while ( *psz && (!buf || (len < n)) )
 430     {
 431         unsigned char cc = *psz++;
 432         if (cc != '+')
 433         {
 434             // plain ASCII char
 435             if (buf)
 436                 *buf++ = cc;
 437             len++;
 438         }
 439         else if (*psz == '-')
 440         {
 441             // encoded plus sign
 442             if (buf)
 443                 *buf++ = cc;
 444             len++;
 445             psz++;
 446         }
 447         else // start of BASE64 encoded string
 448         {
 449             bool lsb, ok;
 450             unsigned int d, l;
 451             for ( ok = lsb = false, d = 0, l = 0;
 452                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 453                   psz++ )
 454             {
 455                 d <<= 6;
 456                 d += cc;
 457                 for (l += 6; l >= 8; lsb = !lsb)
 458                 {
 459                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 460                     if (lsb)
 461                     {
 462                         if (buf)
 463                             *buf++ |= c;
 464                         len ++;
 465                     }
 466                     else
 467                     {
 468                         if (buf)
 469                             *buf = (wchar_t)(c << 8);
 470                     }
 471
 472                     ok = true;
 473                 }
 474             }
 475
 476             if ( !ok )
 477             {
 478                 // in valid UTF7 we should have valid characters after '+'
 479                 return (size_t)-1;
 480             }
 481
 482             if (*psz == '-')
 483                 psz++;
 484         }
 485     }
 486
 487     if ( buf && (len < n) )
 488         *buf = '\0';
 489
 490     return len;
 491 }
 492
 493 //
 494 // BASE64 encoding table
 495 //
 496 static const unsigned char utf7enb64[] =
 497 {
 498     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 499     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 500     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 501     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 502     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 503     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 504     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 505     '4', '5', '6', '7', '8', '9', '+', '/'
 506 };
 507
 508 //
 509 // UTF-7 encoding table
 510 //
 511 // 0 - Set D (directly encoded characters)
 512 // 1 - Set O (optional direct characters)
 513 // 2 - whitespace characters (optional)
 514 // 3 - special characters
 515 //
 516 static const unsigned char utf7encode[128] =
 517 {
 518     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 519     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 520     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 521     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 522     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 523     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 524     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 525     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 526 };
 527
 528 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 529 {
 530     size_t len = 0;
 531
 532     while (*psz && ((!buf) || (len < n)))
 533     {
 534         wchar_t cc = *psz++;
 535         if (cc < 0x80 && utf7encode[cc] < 1)
 536         {
 537             // plain ASCII char
 538             if (buf)
 539                 *buf++ = (char)cc;
 540             len++;
 541         }
 542 #ifndef WC_UTF16
 543         else if (((wxUint32)cc) > 0xffff)
 544         {
 545             // no surrogate pair generation (yet?)
 546             return (size_t)-1;
 547         }
 548 #endif
 549         else
 550         {
 551             if (buf)
 552                 *buf++ = '+';
 553             len++;
 554             if (cc != '+')
 555             {
 556                 // BASE64 encode string
 557                 unsigned int lsb, d, l;
 558                 for (d = 0, l = 0; /*nothing*/; psz++)
 559                 {
 560                     for (lsb = 0; lsb < 2; lsb ++)
 561                     {
 562                         d <<= 8;
 563                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 564
 565                         for (l += 8; l >= 6; )
 566                         {
 567                             l -= 6;
 568                             if (buf)
 569                                 *buf++ = utf7enb64[(d >> l) % 64];
 570                             len++;
 571                         }
 572                     }
 573                     cc = *psz;
 574                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 575                         break;
 576                 }
 577                 if (l != 0)
 578                 {
 579                     if (buf)
 580                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 581                     len++;
 582                 }
 583             }
 584             if (buf)
 585                 *buf++ = '-';
 586             len++;
 587         }
 588     }
 589     if (buf && (len < n))
 590         *buf = 0;
 591     return len;
 592 }
 593
 594 // ----------------------------------------------------------------------------
 595 // UTF-8
 596 // ----------------------------------------------------------------------------
 597
 598 static wxUint32 utf8_max[]=
 599     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 600
 601 // boundaries of the private use area we use to (temporarily) remap invalid
 602 // characters invalid in a UTF-8 encoded string
 603 const wxUint32 wxUnicodePUA = 0x100000;
 604 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 605
 606 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 607 {
 608     size_t len = 0;
 609
 610     while (*psz && ((!buf) || (len < n)))
 611     {
 612         const char *opsz = psz;
 613         bool invalid = false;
 614         unsigned char cc = *psz++, fc = cc;
 615         unsigned cnt;
 616         for (cnt = 0; fc & 0x80; cnt++)
 617             fc <<= 1;
 618         if (!cnt)
 619         {
 620             // plain ASCII char
 621             if (buf)
 622                 *buf++ = cc;
 623             len++;
 624
 625             // escape the escape character for octal escapes
 626             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 627                     && cc == '\\' && (!buf || len < n))
 628             {
 629                 if (buf)
 630                     *buf++ = cc;
 631                 len++;
 632             }
 633         }
 634         else
 635         {
 636             cnt--;
 637             if (!cnt)
 638             {
 639                 // invalid UTF-8 sequence
 640                 invalid = true;
 641             }
 642             else
 643             {
 644                 unsigned ocnt = cnt - 1;
 645                 wxUint32 res = cc & (0x3f >> cnt);
 646                 while (cnt--)
 647                 {
 648                     cc = *psz;
 649                     if ((cc & 0xC0) != 0x80)
 650                     {
 651                         // invalid UTF-8 sequence
 652                         invalid = true;
 653                         break;
 654                     }
 655                     psz++;
 656                     res = (res << 6) | (cc & 0x3f);
 657                 }
 658                 if (invalid || res <= utf8_max[ocnt])
 659                 {
 660                     // illegal UTF-8 encoding
 661                     invalid = true;
 662                 }
 663                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 664                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 665                 {
 666                     // if one of our PUA characters turns up externally
 667                     // it must also be treated as an illegal sequence
 668                     // (a bit like you have to escape an escape character)
 669                     invalid = true;
 670                 }
 671                 else
 672                 {
 673 #ifdef WC_UTF16
 674                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 675                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 676                     if (pa == (size_t)-1)
 677                     {
 678                         invalid = true;
 679                     }
 680                     else
 681                     {
 682                         if (buf)
 683                             buf += pa;
 684                         len += pa;
 685                     }
 686 #else // !WC_UTF16
 687                     if (buf)
 688                         *buf++ = (wchar_t)res;
 689                     len++;
 690 #endif // WC_UTF16/!WC_UTF16
 691                 }
 692             }
 693             if (invalid)
 694             {
 695                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 696                 {
 697                     while (opsz < psz && (!buf || len < n))
 698                     {
 699 #ifdef WC_UTF16
 700                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 701                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 702                         wxASSERT(pa != (size_t)-1);
 703                         if (buf)
 704                             buf += pa;
 705                         opsz++;
 706                         len += pa;
 707 #else
 708                         if (buf)
 709                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 710                         opsz++;
 711                         len++;
 712 #endif
 713                     }
 714                 }
 715                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 716                 {
 717                     while (opsz < psz && (!buf || len < n))
 718                     {
 719                         if ( buf && len + 3 < n )
 720                         {
 721                             unsigned char on = *opsz;
 722                             *buf++ = L'\\';
 723                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 724                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 725                             *buf++ = (wchar_t)( L'0' + on % 010 );
 726                         }
 727                         opsz++;
 728                         len += 4;
 729                     }
 730                 }
 731                 else // MAP_INVALID_UTF8_NOT
 732                 {
 733                     return (size_t)-1;
 734                 }
 735             }
 736         }
 737     }
 738     if (buf && (len < n))
 739         *buf = 0;
 740     return len;
 741 }
 742
 743 static inline bool isoctal(wchar_t wch)
 744 {
 745     return L'0' <= wch && wch <= L'7';
 746 }
 747
 748 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 749 {
 750     size_t len = 0;
 751
 752     while (*psz && ((!buf) || (len < n)))
 753     {
 754         wxUint32 cc;
 755 #ifdef WC_UTF16
 756         // cast is ok for WC_UTF16
 757         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 758         psz += (pa == (size_t)-1) ? 1 : pa;
 759 #else
 760         cc=(*psz++) & 0x7fffffff;
 761 #endif
 762
 763         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 764                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 765         {
 766             if (buf)
 767                 *buf++ = (char)(cc - wxUnicodePUA);
 768             len++;
 769         }
 770         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 771                     && cc == L'\\' && psz[0] == L'\\' )
 772         {
 773             if (buf)
 774                 *buf++ = (char)cc;
 775             psz++;
 776             len++;
 777         }
 778         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 779                     cc == L'\\' &&
 780                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 781         {
 782             if (buf)
 783             {
 784                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 785                                  (psz[1] - L'0')*010 +
 786                                  (psz[2] - L'0'));
 787             }
 788
 789             psz += 3;
 790             len++;
 791         }
 792         else
 793         {
 794             unsigned cnt;
 795             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 796             if (!cnt)
 797             {
 798                 // plain ASCII char
 799                 if (buf)
 800                     *buf++ = (char) cc;
 801                 len++;
 802             }
 803
 804             else
 805             {
 806                 len += cnt + 1;
 807                 if (buf)
 808                 {
 809                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 810                     while (cnt--)
 811                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 812                 }
 813             }
 814         }
 815     }
 816
 817     if (buf && (len<n))
 818         *buf = 0;
 819
 820     return len;
 821 }
 822
 823 // ----------------------------------------------------------------------------
 824 // UTF-16
 825 // ----------------------------------------------------------------------------
 826
 827 #ifdef WORDS_BIGENDIAN
 828     #define wxMBConvUTF16straight wxMBConvUTF16BE
 829     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 830 #else
 831     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 832     #define wxMBConvUTF16straight wxMBConvUTF16LE
 833 #endif
 834
 835
 836 #ifdef WC_UTF16
 837
 838 // copy 16bit MB to 16bit String
 839 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 840 {
 841     size_t len=0;
 842
 843     while (*(wxUint16*)psz && (!buf || len < n))
 844     {
 845         if (buf)
 846             *buf++ = *(wxUint16*)psz;
 847         len++;
 848
 849         psz += sizeof(wxUint16);
 850     }
 851     if (buf && len<n)   *buf=0;
 852
 853     return len;
 854 }
 855
 856
 857 // copy 16bit String to 16bit MB
 858 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 859 {
 860     size_t len=0;
 861
 862     while (*psz && (!buf || len < n))
 863     {
 864         if (buf)
 865         {
 866             *(wxUint16*)buf = *psz;
 867             buf += sizeof(wxUint16);
 868         }
 869         len += sizeof(wxUint16);
 870         psz++;
 871     }
 872     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 873
 874     return len;
 875 }
 876
 877
 878 // swap 16bit MB to 16bit String
 879 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 880 {
 881     size_t len = 0;
 882
 883     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 884     // inside the string
 885     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 886     {
 887         if ( buf )
 888         {
 889             ((char *)buf)[0] = psz[1];
 890             ((char *)buf)[1] = psz[0];
 891             buf++;
 892         }
 893         len++;
 894         psz += 2;
 895     }
 896
 897     if ( buf && len < n )
 898         *buf = L'\0';
 899
 900     return len;
 901 }
 902
 903
 904 // swap 16bit MB to 16bit String
 905 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 906 {
 907     size_t len = 0;
 908
 909     while ( *psz && (!buf || len < n) )
 910     {
 911         if ( buf )
 912         {
 913             *buf++ = ((char*)psz)[1];
 914             *buf++ = ((char*)psz)[0];
 915         }
 916         len += 2;
 917         psz++;
 918     }
 919
 920     if ( buf && len < n )
 921         *buf = '\0';
 922
 923     return len;
 924 }
 925
 926
 927 #else // WC_UTF16
 928
 929
 930 // copy 16bit MB to 32bit String
 931 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 932 {
 933     size_t len=0;
 934
 935     while (*(wxUint16*)psz && (!buf || len < n))
 936     {
 937         wxUint32 cc;
 938         size_t pa=decode_utf16((wxUint16*)psz, cc);
 939         if (pa == (size_t)-1)
 940             return pa;
 941
 942         if (buf)
 943             *buf++ = (wchar_t)cc;
 944         len++;
 945         psz += pa * sizeof(wxUint16);
 946     }
 947     if (buf && len<n)   *buf=0;
 948
 949     return len;
 950 }
 951
 952
 953 // copy 32bit String to 16bit MB
 954 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 955 {
 956     size_t len=0;
 957
 958     while (*psz && (!buf || len < n))
 959     {
 960         wxUint16 cc[2];
 961         size_t pa=encode_utf16(*psz, cc);
 962
 963         if (pa == (size_t)-1)
 964             return pa;
 965
 966         if (buf)
 967         {
 968             *(wxUint16*)buf = cc[0];
 969             buf += sizeof(wxUint16);
 970             if (pa > 1)
 971             {
 972                 *(wxUint16*)buf = cc[1];
 973                 buf += sizeof(wxUint16);
 974             }
 975         }
 976
 977         len += pa*sizeof(wxUint16);
 978         psz++;
 979     }
 980     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 981
 982     return len;
 983 }
 984
 985
 986 // swap 16bit MB to 32bit String
 987 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 988 {
 989     size_t len=0;
 990
 991     while (*(wxUint16*)psz && (!buf || len < n))
 992     {
 993         wxUint32 cc;
 994         char tmp[4];
 995         tmp[0]=psz[1];  tmp[1]=psz[0];
 996         tmp[2]=psz[3];  tmp[3]=psz[2];
 997
 998         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 999         if (pa == (size_t)-1)
1000             return pa;
1001
1002         if (buf)
1003             *buf++ = (wchar_t)cc;
1004
1005         len++;
1006         psz += pa * sizeof(wxUint16);
1007     }
1008     if (buf && len<n)   *buf=0;
1009
1010     return len;
1011 }
1012
1013
1014 // swap 32bit String to 16bit MB
1015 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1016 {
1017     size_t len=0;
1018
1019     while (*psz && (!buf || len < n))
1020     {
1021         wxUint16 cc[2];
1022         size_t pa=encode_utf16(*psz, cc);
1023
1024         if (pa == (size_t)-1)
1025             return pa;
1026
1027         if (buf)
1028         {
1029             *buf++ = ((char*)cc)[1];
1030             *buf++ = ((char*)cc)[0];
1031             if (pa > 1)
1032             {
1033                 *buf++ = ((char*)cc)[3];
1034                 *buf++ = ((char*)cc)[2];
1035             }
1036         }
1037
1038         len += pa*sizeof(wxUint16);
1039         psz++;
1040     }
1041     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1042
1043     return len;
1044 }
1045
1046 #endif // WC_UTF16
1047
1048
1049 // ----------------------------------------------------------------------------
1050 // UTF-32
1051 // ----------------------------------------------------------------------------
1052
// The UTF-32 conversion functions below are implemented for a "straight" and
// a "swap" class: map these names to the BE/LE classes depending on whether
// WORDS_BIGENDIAN is defined, so that "straight" is the big endian class on
// big endian builds and the little endian one otherwise.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1064
1065
1066 #ifdef WC_UTF16
1067
1068 // copy 32bit MB to 16bit String
1069 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1070 {
1071     size_t len=0;
1072
1073     while (*(wxUint32*)psz && (!buf || len < n))
1074     {
1075         wxUint16 cc[2];
1076
1077         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1078         if (pa == (size_t)-1)
1079             return pa;
1080
1081         if (buf)
1082         {
1083             *buf++ = cc[0];
1084             if (pa > 1)
1085                 *buf++ = cc[1];
1086         }
1087         len += pa;
1088         psz += sizeof(wxUint32);
1089     }
1090     if (buf && len<n)   *buf=0;
1091
1092     return len;
1093 }
1094
1095
1096 // copy 16bit String to 32bit MB
1097 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1098 {
1099     size_t len=0;
1100
1101     while (*psz && (!buf || len < n))
1102     {
1103         wxUint32 cc;
1104
1105         // cast is ok for WC_UTF16
1106         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1107         if (pa == (size_t)-1)
1108             return pa;
1109
1110         if (buf)
1111         {
1112             *(wxUint32*)buf = cc;
1113             buf += sizeof(wxUint32);
1114         }
1115         len += sizeof(wxUint32);
1116         psz += pa;
1117     }
1118
1119     if (buf && len<=n-sizeof(wxUint32))
1120         *(wxUint32*)buf=0;
1121
1122     return len;
1123 }
1124
1125
1126
1127 // swap 32bit MB to 16bit String
1128 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1129 {
1130     size_t len=0;
1131
1132     while (*(wxUint32*)psz && (!buf || len < n))
1133     {
1134         char tmp[4];
1135         tmp[0] = psz[3];   tmp[1] = psz[2];
1136         tmp[2] = psz[1];   tmp[3] = psz[0];
1137
1138
1139         wxUint16 cc[2];
1140
1141         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1142         if (pa == (size_t)-1)
1143             return pa;
1144
1145         if (buf)
1146         {
1147             *buf++ = cc[0];
1148             if (pa > 1)
1149                 *buf++ = cc[1];
1150         }
1151         len += pa;
1152         psz += sizeof(wxUint32);
1153     }
1154
1155     if (buf && len<n)
1156         *buf=0;
1157
1158     return len;
1159 }
1160
1161
1162 // swap 16bit String to 32bit MB
1163 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1164 {
1165     size_t len=0;
1166
1167     while (*psz && (!buf || len < n))
1168     {
1169         char cc[4];
1170
1171         // cast is ok for WC_UTF16
1172         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1173         if (pa == (size_t)-1)
1174             return pa;
1175
1176         if (buf)
1177         {
1178             *buf++ = cc[3];
1179             *buf++ = cc[2];
1180             *buf++ = cc[1];
1181             *buf++ = cc[0];
1182         }
1183         len += sizeof(wxUint32);
1184         psz += pa;
1185     }
1186
1187     if (buf && len<=n-sizeof(wxUint32))
1188         *(wxUint32*)buf=0;
1189
1190     return len;
1191 }
1192
1193 #else // WC_UTF16
1194
1195
1196 // copy 32bit MB to 32bit String
1197 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1198 {
1199     size_t len=0;
1200
1201     while (*(wxUint32*)psz && (!buf || len < n))
1202     {
1203         if (buf)
1204             *buf++ = (wchar_t)(*(wxUint32*)psz);
1205         len++;
1206         psz += sizeof(wxUint32);
1207     }
1208
1209     if (buf && len<n)
1210         *buf=0;
1211
1212     return len;
1213 }
1214
1215
1216 // copy 32bit String to 32bit MB
1217 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1218 {
1219     size_t len=0;
1220
1221     while (*psz && (!buf || len < n))
1222     {
1223         if (buf)
1224         {
1225             *(wxUint32*)buf = *psz;
1226             buf += sizeof(wxUint32);
1227         }
1228
1229         len += sizeof(wxUint32);
1230         psz++;
1231     }
1232
1233     if (buf && len<=n-sizeof(wxUint32))
1234         *(wxUint32*)buf=0;
1235
1236     return len;
1237 }
1238
1239
1240 // swap 32bit MB to 32bit String
1241 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1242 {
1243     size_t len=0;
1244
1245     while (*(wxUint32*)psz && (!buf || len < n))
1246     {
1247         if (buf)
1248         {
1249             ((char *)buf)[0] = psz[3];
1250             ((char *)buf)[1] = psz[2];
1251             ((char *)buf)[2] = psz[1];
1252             ((char *)buf)[3] = psz[0];
1253             buf++;
1254         }
1255         len++;
1256         psz += sizeof(wxUint32);
1257     }
1258
1259     if (buf && len<n)
1260         *buf=0;
1261
1262     return len;
1263 }
1264
1265
1266 // swap 32bit String to 32bit MB
1267 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1268 {
1269     size_t len=0;
1270
1271     while (*psz && (!buf || len < n))
1272     {
1273         if (buf)
1274         {
1275             *buf++ = ((char *)psz)[3];
1276             *buf++ = ((char *)psz)[2];
1277             *buf++ = ((char *)psz)[1];
1278             *buf++ = ((char *)psz)[0];
1279         }
1280         len += sizeof(wxUint32);
1281         psz++;
1282     }
1283
1284     if (buf && len<=n-sizeof(wxUint32))
1285         *(wxUint32*)buf=0;
1286
1287     return len;
1288 }
1289
1290
1291 #endif // WC_UTF16
1292
1293
1294 // ============================================================================
1295 // The classes doing conversion using the iconv_xxx() functions
1296 // ============================================================================
1297
1298 #ifdef HAVE_ICONV
1299
1300 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1301 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1302 //     (unless there's yet another bug in glibc) the only case when iconv()
1303 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1304 //     left in the input buffer -- when _real_ error occurs,
1305 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1306 //     iconv() failure.
1307 //     [This bug does not appear in glibc 2.2.]
1308 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1309 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1310                                      (errno != E2BIG || bufLeft != 0))
1311 #else
1312 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1313 #endif
1314
// cast the address of an input pointer to the type passed as the input
// buffer argument of iconv() in this file; ICONV_CONST is not defined in this
// part of the file -- presumably it expands to "const" or to nothing
// depending on the iconv() prototype of the system, to be confirmed
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value the handles returned by iconv_open() are compared with in this
// file to detect that a conversion descriptor couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)

// select the byte swapping macro applied to individual wchar_t values
// (WC_BSWAP) and the font encoding corresponding to wchar_t (WC_ENC)
// according to the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1329
1330 // ----------------------------------------------------------------------------
1331 // wxMBConv_iconv: encapsulates an iconv character set
1332 // ----------------------------------------------------------------------------
1333
// wxMBConv implementation converting between a multibyte charset and wchar_t
// using a pair of iconv conversion descriptors, one for each direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the name of the multibyte charset; use IsOk() to check whether
    // the conversion descriptors could be opened for it
    wxMBConv_iconv(const wxChar *name);
    virtual ~wxMBConv_iconv();

    // convert from multibyte to wide char and in the other direction
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // returns true only if the descriptors for both directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;
#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // returns m_nulBuf and stores m_nulLen in *nulLen, computing them on the
    // first call
    virtual const char *GetMBNul(size_t *nulLen) const;

    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

    // NUL representation: m_nulLen is initialized to (size_t)-2 by the ctor,
    // which GetMBNul() takes to mean "not computed yet"
    size_t m_nulLen;
    char m_nulBuf[8];
};
1371
1372 // make the constructor available for unit testing
1373 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1374 {
1375     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1376     if ( !result->IsOk() )
1377     {
1378         delete result;
1379         return 0;
1380     }
1381     return result;
1382 }
1383
// definitions of the static members of wxMBConv_iconv: the charset name is
// initially empty and is filled in by the constructor when it finds a
// usable wide char charset
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1386
1387 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1388 {
1389     m_nulLen = (size_t)-2;
1390
1391     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1392     // names for the charsets
1393     const wxCharBuffer cname(wxString(name).ToAscii());
1394
1395     // check for charset that represents wchar_t:
1396     if ( ms_wcCharsetName.empty() )
1397     {
1398         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1399
1400 #if wxUSE_FONTMAP
1401         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1402 #else // !wxUSE_FONTMAP
1403         static const wxChar *names[] =
1404         {
1405 #if SIZEOF_WCHAR_T == 4
1406             _T("UCS-4"),
1407 #elif SIZEOF_WCHAR_T = 2
1408             _T("UCS-2"),
1409 #endif
1410             NULL
1411         };
1412 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1413
1414         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1415         {
1416             const wxString nameCS(*names);
1417
1418             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1419             wxString nameXE(nameCS);
1420             #ifdef WORDS_BIGENDIAN
1421                 nameXE += _T("BE");
1422             #else // little endian
1423                 nameXE += _T("LE");
1424             #endif
1425
1426             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1427                        nameXE.c_str());
1428
1429             m2w = iconv_open(nameXE.ToAscii(), cname);
1430             if ( m2w == ICONV_T_INVALID )
1431             {
1432                 // try charset w/o bytesex info (e.g. "UCS4")
1433                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1434                            nameCS.c_str());
1435                 m2w = iconv_open(nameCS.ToAscii(), cname);
1436
1437                 // and check for bytesex ourselves:
1438                 if ( m2w != ICONV_T_INVALID )
1439                 {
1440                     char    buf[2], *bufPtr;
1441                     wchar_t wbuf[2], *wbufPtr;
1442                     size_t  insz, outsz;
1443                     size_t  res;
1444
1445                     buf[0] = 'A';
1446                     buf[1] = 0;
1447                     wbuf[0] = 0;
1448                     insz = 2;
1449                     outsz = SIZEOF_WCHAR_T * 2;
1450                     wbufPtr = wbuf;
1451                     bufPtr = buf;
1452
1453                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1454                                 (char**)&wbufPtr, &outsz);
1455
1456                     if (ICONV_FAILED(res, insz))
1457                     {
1458                         wxLogLastError(wxT("iconv"));
1459                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1460                                    nameCS.c_str());
1461                     }
1462                     else // ok, can convert to this encoding, remember it
1463                     {
1464                         ms_wcCharsetName = nameCS;
1465                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1466                     }
1467                 }
1468             }
1469             else // use charset not requiring byte swapping
1470             {
1471                 ms_wcCharsetName = nameXE;
1472             }
1473         }
1474
1475         wxLogTrace(TRACE_STRCONV,
1476                    wxT("iconv wchar_t charset is \"%s\"%s"),
1477                    ms_wcCharsetName.empty() ? _T("<none>")
1478                                             : ms_wcCharsetName.c_str(),
1479                    ms_wcNeedsSwap ? _T(" (needs swap)")
1480                                   : _T(""));
1481     }
1482     else // we already have ms_wcCharsetName
1483     {
1484         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1485     }
1486
1487     if ( ms_wcCharsetName.empty() )
1488     {
1489         w2m = ICONV_T_INVALID;
1490     }
1491     else
1492     {
1493         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1494         if ( w2m == ICONV_T_INVALID )
1495         {
1496             wxLogTrace(TRACE_STRCONV,
1497                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1498                        ms_wcCharsetName.c_str(), cname.data());
1499         }
1500     }
1501 }
1502
1503 wxMBConv_iconv::~wxMBConv_iconv()
1504 {
1505     if ( m2w != ICONV_T_INVALID )
1506         iconv_close(m2w);
1507     if ( w2m != ICONV_T_INVALID )
1508         iconv_close(w2m);
1509 }
1510
1511 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1512 {
1513 #if wxUSE_THREADS
1514     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1515     //     Unfortunately there is a couple of global wxCSConv objects such as
1516     //     wxConvLocal that are used all over wx code, so we have to make sure
1517     //     the handle is used by at most one thread at the time. Otherwise
1518     //     only a few wx classes would be safe to use from non-main threads
1519     //     as MB<->WC conversion would fail "randomly".
1520     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1521 #endif
1522
1523     size_t inbuf = strlen(psz);
1524     size_t outbuf = n * SIZEOF_WCHAR_T;
1525     size_t res, cres;
1526     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1527     wchar_t *bufPtr = buf;
1528     const char *pszPtr = psz;
1529
1530     if (buf)
1531     {
1532         // have destination buffer, convert there
1533         cres = iconv(m2w,
1534                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1535                      (char**)&bufPtr, &outbuf);
1536         res = n - (outbuf / SIZEOF_WCHAR_T);
1537
1538         if (ms_wcNeedsSwap)
1539         {
1540             // convert to native endianness
1541             for ( unsigned i = 0; i < res; i++ )
1542                 buf[n] = WC_BSWAP(buf[i]);
1543         }
1544
1545         // NB: iconv was given only strlen(psz) characters on input, and so
1546         //     it couldn't convert the trailing zero. Let's do it ourselves
1547         //     if there's some room left for it in the output buffer.
1548         if (res < n)
1549             buf[res] = 0;
1550     }
1551     else
1552     {
1553         // no destination buffer... convert using temp buffer
1554         // to calculate destination buffer requirement
1555         wchar_t tbuf[8];
1556         res = 0;
1557         do {
1558             bufPtr = tbuf;
1559             outbuf = 8*SIZEOF_WCHAR_T;
1560
1561             cres = iconv(m2w,
1562                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1563                          (char**)&bufPtr, &outbuf );
1564
1565             res += 8-(outbuf/SIZEOF_WCHAR_T);
1566         } while ((cres==(size_t)-1) && (errno==E2BIG));
1567     }
1568
1569     if (ICONV_FAILED(cres, inbuf))
1570     {
1571         //VS: it is ok if iconv fails, hence trace only
1572         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1573         return (size_t)-1;
1574     }
1575
1576     return res;
1577 }
1578
1579 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1580 {
1581 #if wxUSE_THREADS
1582     // NB: explained in MB2WC
1583     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1584 #endif
1585
1586     size_t inlen = wxWcslen(psz);
1587     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1588     size_t outbuf = n;
1589     size_t res, cres;
1590
1591     wchar_t *tmpbuf = 0;
1592
1593     if (ms_wcNeedsSwap)
1594     {
1595         // need to copy to temp buffer to switch endianness
1596         // (doing WC_BSWAP twice on the original buffer won't help, as it
1597         //  could be in read-only memory, or be accessed in some other thread)
1598         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1599         for ( size_t i = 0; i < inlen; i++ )
1600             tmpbuf[n] = WC_BSWAP(psz[i]);
1601         tmpbuf[inlen] = L'\0';
1602         psz = tmpbuf;
1603     }
1604
1605     if (buf)
1606     {
1607         // have destination buffer, convert there
1608         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1609
1610         res = n-outbuf;
1611
1612         // NB: iconv was given only wcslen(psz) characters on input, and so
1613         //     it couldn't convert the trailing zero. Let's do it ourselves
1614         //     if there's some room left for it in the output buffer.
1615         if (res < n)
1616             buf[0] = 0;
1617     }
1618     else
1619     {
1620         // no destination buffer... convert using temp buffer
1621         // to calculate destination buffer requirement
1622         char tbuf[16];
1623         res = 0;
1624         do {
1625             buf = tbuf; outbuf = 16;
1626
1627             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1628
1629             res += 16 - outbuf;
1630         } while ((cres==(size_t)-1) && (errno==E2BIG));
1631     }
1632
1633     if (ms_wcNeedsSwap)
1634     {
1635         free(tmpbuf);
1636     }
1637
1638     if (ICONV_FAILED(cres, inbuf))
1639     {
1640         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1641         return (size_t)-1;
1642     }
1643
1644     return res;
1645 }
1646
1647 const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
1648 {
1649     if ( m_nulLen == (size_t)-2 )
1650     {
1651         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1652
1653 #if wxUSE_THREADS
1654         // NB: explained in MB2WC
1655         wxMutexLocker lock(self->m_iconvMutex);
1656 #endif
1657
1658         size_t inLen = 1,
1659                outLen = WXSIZEOF(m_nulBuf);
1660         self->m_nulLen = iconv(w2m, ICONV_CHAR_CAST(L""), &inLen,
1661                                &self->m_nulBuf, &outLen);
1662     }
1663
1664     *nulLen = m_nulLen;
1665     return m_nulBuf;
1666 }
1667
1668 #endif // HAVE_ICONV
1669
1670
1671 // ============================================================================
1672 // Win32 conversion classes
1673 // ============================================================================
1674
1675 #ifdef wxHAVE_WIN32_MB2WC
1676
1677 // from utils.cpp
1678 #if wxUSE_FONTMAP
1679 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1680 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1681 #endif
1682
// wxMBConv implementation using the Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions with the code page stored in m_CodePage.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses CP_ACP as the code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_nulLen = (size_t)-2;
    }

#if wxUSE_FONTMAP
    // ctors taking the charset name or the font encoding: the code page is
    // obtained from wxCharsetToCodepage() or wxEncodingToCodepage(), use
    // IsOk() to check the result
    wxMBConv_win32(const wxChar* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_nulLen = (size_t)-2;
    }

    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_nulLen = (size_t)-2;
    }
#endif // wxUSE_FONTMAP

    // Convert the NUL-terminated multibyte string psz to wide chars, storing
    // them in buf (of size n wide characters) if it is not NULL or just
    // computing the length otherwise. Returns the length of the result
    // without the terminating NUL or (size_t)-1 on error.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
        // an error (tested under Windows Server 2003) and apparently it is
        // done on purpose, i.e. the function accepts any input in this case
        // and although I'd prefer to return error on ill-formed output, our
        // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
        // explicitly ill-formed according to RFC 2152) neither so we don't
        // even have any fallback here...
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and if it is specified on older versions, conversion
        // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
        // fails. So we can only use the flag on newer Windows versions.
        // Additionally, the flag is not supported by UTF7, symbol and CJK
        // encodings. See here:
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        int flags = 0;
        if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
             m_CodePage < 50000 &&
             IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }
        else if ( m_CodePage == CP_UTF8 )
        {
            // Avoid round-trip in the special case of UTF-8 by using our
            // own UTF-8 conversion code:
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return (size_t)-1;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return (size_t)-1;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // Convert the NUL-terminated wide string pwz to multibyte, storing the
    // result in buf (of size n bytes) if it is not NULL or just computing the
    // length otherwise. Returns the length of the result without the
    // terminating NUL or (size_t)-1 on error; when buf is not NULL a lossy
    // conversion is also treated as an error.
    size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return (size_t)-1;
        }

        // if we were really converting, check if we succeeded
        if ( buf )
        {
            if ( flags )
            {
                // check if the conversion failed, i.e. if any replacements
                // were done
                if ( usedDef )
                    return (size_t)-1;
            }
            else // we must resort to double tripping...
            {
                wxWCharBuffer wcBuf(n);
                if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
                        wcscmp(wcBuf, pwz) != 0 )
                {
                    // we didn't obtain the same thing we started from, hence
                    // the conversion was lossy and we consider that it failed
                    return (size_t)-1;
                }
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // the code page is valid unless it is -1 (presumably the value returned
    // by the code page lookup functions used in the ctors on failure -- to
    // be confirmed against their implementation)
    bool IsOk() const { return m_CodePage != -1; }

private:
    // returns true if WC_NO_BEST_FIT_CHARS can be used with this version of
    // Windows; the OS version is checked on the first call only and the
    // result is cached in a static variable
    static bool CanUseNoBestFit()
    {
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxWIN95:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxWINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown, be conservative by default
                    s_isWin98Or2k = 0;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // returns true if running under Windows 2000 SP4 or any later version
    // (always false under Windows CE); the result of the version check is
    // cached in a static variable
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }

    // returns m_nulBuf containing the representation of NUL in this code
    // page and stores its length, or (size_t)-1 if it couldn't be
    // determined, in *nulLen; the conversion is done on the first call only
    virtual const char *GetMBNul(size_t *nulLen) const
    {
        // (size_t)-2 is the value set by the ctors meaning "not computed yet"
        if ( m_nulLen == (size_t)-2 )
        {
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);

            self->m_nulLen = ::WideCharToMultiByte
                               (
                                    m_CodePage,         // code page
                                    0,                  // no flags
                                    L"",                // input string
                                    1,                  // translate just NUL
                                    self->m_nulBuf,     // output buffer
                                    WXSIZEOF(m_nulBuf), // and its size
                                    NULL,               // "replacement" char
                                    NULL                // [out] was it used?
                               );

            // WideCharToMultiByte() returned 0: remember the failure
            if ( m_nulLen == 0 )
                self->m_nulLen = (size_t)-1;
        }

        *nulLen = m_nulLen;
        return m_nulBuf;
    }

    // the code page used for the conversions
    long m_CodePage;

    // the cached length and bytes of the NUL representation, see GetMBNul()
    size_t m_nulLen;
    char m_nulBuf[8];
};
1959
1960 #endif // wxHAVE_WIN32_MB2WC
1961
1962 // ============================================================================
1963 // Cocoa conversion classes
1964 // ============================================================================
1965
1966 #if defined(__WXCOCOA__)
1967
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1971
1972 #include <CoreFoundation/CFString.h>
1973 #include <CoreFoundation/CFStringEncodingExt.h>
1974
1975 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1976 {
1977     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1978     if ( encoding == wxFONTENCODING_DEFAULT )
1979     {
1980         enc = CFStringGetSystemEncoding();
1981     }
1982     else switch( encoding)
1983     {
1984         case wxFONTENCODING_ISO8859_1 :
1985             enc = kCFStringEncodingISOLatin1 ;
1986             break ;
1987         case wxFONTENCODING_ISO8859_2 :
1988             enc = kCFStringEncodingISOLatin2;
1989             break ;
1990         case wxFONTENCODING_ISO8859_3 :
1991             enc = kCFStringEncodingISOLatin3 ;
1992             break ;
1993         case wxFONTENCODING_ISO8859_4 :
1994             enc = kCFStringEncodingISOLatin4;
1995             break ;
1996         case wxFONTENCODING_ISO8859_5 :
1997             enc = kCFStringEncodingISOLatinCyrillic;
1998             break ;
1999         case wxFONTENCODING_ISO8859_6 :
2000             enc = kCFStringEncodingISOLatinArabic;
2001             break ;
2002         case wxFONTENCODING_ISO8859_7 :
2003             enc = kCFStringEncodingISOLatinGreek;
2004             break ;
2005         case wxFONTENCODING_ISO8859_8 :
2006             enc = kCFStringEncodingISOLatinHebrew;
2007             break ;
2008         case wxFONTENCODING_ISO8859_9 :
2009             enc = kCFStringEncodingISOLatin5;
2010             break ;
2011         case wxFONTENCODING_ISO8859_10 :
2012             enc = kCFStringEncodingISOLatin6;
2013             break ;
2014         case wxFONTENCODING_ISO8859_11 :
2015             enc = kCFStringEncodingISOLatinThai;
2016             break ;
2017         case wxFONTENCODING_ISO8859_13 :
2018             enc = kCFStringEncodingISOLatin7;
2019             break ;
2020         case wxFONTENCODING_ISO8859_14 :
2021             enc = kCFStringEncodingISOLatin8;
2022             break ;
2023         case wxFONTENCODING_ISO8859_15 :
2024             enc = kCFStringEncodingISOLatin9;
2025             break ;
2026
2027         case wxFONTENCODING_KOI8 :
2028             enc = kCFStringEncodingKOI8_R;
2029             break ;
2030         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2031             enc = kCFStringEncodingDOSRussian;
2032             break ;
2033
2034 //      case wxFONTENCODING_BULGARIAN :
2035 //          enc = ;
2036 //          break ;
2037
2038         case wxFONTENCODING_CP437 :
2039             enc =kCFStringEncodingDOSLatinUS ;
2040             break ;
2041         case wxFONTENCODING_CP850 :
2042             enc = kCFStringEncodingDOSLatin1;
2043             break ;
2044         case wxFONTENCODING_CP852 :
2045             enc = kCFStringEncodingDOSLatin2;
2046             break ;
2047         case wxFONTENCODING_CP855 :
2048             enc = kCFStringEncodingDOSCyrillic;
2049             break ;
2050         case wxFONTENCODING_CP866 :
2051             enc =kCFStringEncodingDOSRussian ;
2052             break ;
2053         case wxFONTENCODING_CP874 :
2054             enc = kCFStringEncodingDOSThai;
2055             break ;
2056         case wxFONTENCODING_CP932 :
2057             enc = kCFStringEncodingDOSJapanese;
2058             break ;
2059         case wxFONTENCODING_CP936 :
2060             enc =kCFStringEncodingDOSChineseSimplif ;
2061             break ;
2062         case wxFONTENCODING_CP949 :
2063             enc = kCFStringEncodingDOSKorean;
2064             break ;
2065         case wxFONTENCODING_CP950 :
2066             enc = kCFStringEncodingDOSChineseTrad;
2067             break ;
2068         case wxFONTENCODING_CP1250 :
2069             enc = kCFStringEncodingWindowsLatin2;
2070             break ;
2071         case wxFONTENCODING_CP1251 :
2072             enc =kCFStringEncodingWindowsCyrillic ;
2073             break ;
2074         case wxFONTENCODING_CP1252 :
2075             enc =kCFStringEncodingWindowsLatin1 ;
2076             break ;
2077         case wxFONTENCODING_CP1253 :
2078             enc = kCFStringEncodingWindowsGreek;
2079             break ;
2080         case wxFONTENCODING_CP1254 :
2081             enc = kCFStringEncodingWindowsLatin5;
2082             break ;
2083         case wxFONTENCODING_CP1255 :
2084             enc =kCFStringEncodingWindowsHebrew ;
2085             break ;
2086         case wxFONTENCODING_CP1256 :
2087             enc =kCFStringEncodingWindowsArabic ;
2088             break ;
2089         case wxFONTENCODING_CP1257 :
2090             enc = kCFStringEncodingWindowsBalticRim;
2091             break ;
2092 //   This only really encodes to UTF7 (if that) evidently
2093 //        case wxFONTENCODING_UTF7 :
2094 //            enc = kCFStringEncodingNonLossyASCII ;
2095 //            break ;
2096         case wxFONTENCODING_UTF8 :
2097             enc = kCFStringEncodingUTF8 ;
2098             break ;
2099         case wxFONTENCODING_EUC_JP :
2100             enc = kCFStringEncodingEUC_JP;
2101             break ;
2102         case wxFONTENCODING_UTF16 :
2103             enc = kCFStringEncodingUnicode ;
2104             break ;
2105         case wxFONTENCODING_MACROMAN :
2106             enc = kCFStringEncodingMacRoman ;
2107             break ;
2108         case wxFONTENCODING_MACJAPANESE :
2109             enc = kCFStringEncodingMacJapanese ;
2110             break ;
2111         case wxFONTENCODING_MACCHINESETRAD :
2112             enc = kCFStringEncodingMacChineseTrad ;
2113             break ;
2114         case wxFONTENCODING_MACKOREAN :
2115             enc = kCFStringEncodingMacKorean ;
2116             break ;
2117         case wxFONTENCODING_MACARABIC :
2118             enc = kCFStringEncodingMacArabic ;
2119             break ;
2120         case wxFONTENCODING_MACHEBREW :
2121             enc = kCFStringEncodingMacHebrew ;
2122             break ;
2123         case wxFONTENCODING_MACGREEK :
2124             enc = kCFStringEncodingMacGreek ;
2125             break ;
2126         case wxFONTENCODING_MACCYRILLIC :
2127             enc = kCFStringEncodingMacCyrillic ;
2128             break ;
2129         case wxFONTENCODING_MACDEVANAGARI :
2130             enc = kCFStringEncodingMacDevanagari ;
2131             break ;
2132         case wxFONTENCODING_MACGURMUKHI :
2133             enc = kCFStringEncodingMacGurmukhi ;
2134             break ;
2135         case wxFONTENCODING_MACGUJARATI :
2136             enc = kCFStringEncodingMacGujarati ;
2137             break ;
2138         case wxFONTENCODING_MACORIYA :
2139             enc = kCFStringEncodingMacOriya ;
2140             break ;
2141         case wxFONTENCODING_MACBENGALI :
2142             enc = kCFStringEncodingMacBengali ;
2143             break ;
2144         case wxFONTENCODING_MACTAMIL :
2145             enc = kCFStringEncodingMacTamil ;
2146             break ;
2147         case wxFONTENCODING_MACTELUGU :
2148             enc = kCFStringEncodingMacTelugu ;
2149             break ;
2150         case wxFONTENCODING_MACKANNADA :
2151             enc = kCFStringEncodingMacKannada ;
2152             break ;
2153         case wxFONTENCODING_MACMALAJALAM :
2154             enc = kCFStringEncodingMacMalayalam ;
2155             break ;
2156         case wxFONTENCODING_MACSINHALESE :
2157             enc = kCFStringEncodingMacSinhalese ;
2158             break ;
2159         case wxFONTENCODING_MACBURMESE :
2160             enc = kCFStringEncodingMacBurmese ;
2161             break ;
2162         case wxFONTENCODING_MACKHMER :
2163             enc = kCFStringEncodingMacKhmer ;
2164             break ;
2165         case wxFONTENCODING_MACTHAI :
2166             enc = kCFStringEncodingMacThai ;
2167             break ;
2168         case wxFONTENCODING_MACLAOTIAN :
2169             enc = kCFStringEncodingMacLaotian ;
2170             break ;
2171         case wxFONTENCODING_MACGEORGIAN :
2172             enc = kCFStringEncodingMacGeorgian ;
2173             break ;
2174         case wxFONTENCODING_MACARMENIAN :
2175             enc = kCFStringEncodingMacArmenian ;
2176             break ;
2177         case wxFONTENCODING_MACCHINESESIMP :
2178             enc = kCFStringEncodingMacChineseSimp ;
2179             break ;
2180         case wxFONTENCODING_MACTIBETAN :
2181             enc = kCFStringEncodingMacTibetan ;
2182             break ;
2183         case wxFONTENCODING_MACMONGOLIAN :
2184             enc = kCFStringEncodingMacMongolian ;
2185             break ;
2186         case wxFONTENCODING_MACETHIOPIC :
2187             enc = kCFStringEncodingMacEthiopic ;
2188             break ;
2189         case wxFONTENCODING_MACCENTRALEUR :
2190             enc = kCFStringEncodingMacCentralEurRoman ;
2191             break ;
2192         case wxFONTENCODING_MACVIATNAMESE :
2193             enc = kCFStringEncodingMacVietnamese ;
2194             break ;
2195         case wxFONTENCODING_MACARABICEXT :
2196             enc = kCFStringEncodingMacExtArabic ;
2197             break ;
2198         case wxFONTENCODING_MACSYMBOL :
2199             enc = kCFStringEncodingMacSymbol ;
2200             break ;
2201         case wxFONTENCODING_MACDINGBATS :
2202             enc = kCFStringEncodingMacDingbats ;
2203             break ;
2204         case wxFONTENCODING_MACTURKISH :
2205             enc = kCFStringEncodingMacTurkish ;
2206             break ;
2207         case wxFONTENCODING_MACCROATIAN :
2208             enc = kCFStringEncodingMacCroatian ;
2209             break ;
2210         case wxFONTENCODING_MACICELANDIC :
2211             enc = kCFStringEncodingMacIcelandic ;
2212             break ;
2213         case wxFONTENCODING_MACROMANIAN :
2214             enc = kCFStringEncodingMacRomanian ;
2215             break ;
2216         case wxFONTENCODING_MACCELTIC :
2217             enc = kCFStringEncodingMacCeltic ;
2218             break ;
2219         case wxFONTENCODING_MACGAELIC :
2220             enc = kCFStringEncodingMacGaelic ;
2221             break ;
2222 //      case wxFONTENCODING_MACKEYBOARD :
2223 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2224 //          break ;
2225         default :
2226             // because gcc is picky
2227             break ;
2228     } ;
2229     return enc ;
2230 }
2231
2232 class wxMBConv_cocoa : public wxMBConv
2233 {
2234 public:
2235     wxMBConv_cocoa()
2236     {
2237         Init(CFStringGetSystemEncoding()) ;
2238     }
2239
2240 #if wxUSE_FONTMAP
2241     wxMBConv_cocoa(const wxChar* name)
2242     {
2243         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2244     }
2245 #endif
2246
2247     wxMBConv_cocoa(wxFontEncoding encoding)
2248     {
2249         Init( wxCFStringEncFromFontEnc(encoding) );
2250     }
2251
2252     ~wxMBConv_cocoa()
2253     {
2254     }
2255
2256     void Init( CFStringEncoding encoding)
2257     {
2258         m_encoding = encoding ;
2259     }
2260
2261     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2262     {
2263         wxASSERT(szUnConv);
2264
2265         CFStringRef theString = CFStringCreateWithBytes (
2266                                                 NULL, //the allocator
2267                                                 (const UInt8*)szUnConv,
2268                                                 strlen(szUnConv),
2269                                                 m_encoding,
2270                                                 false //no BOM/external representation
2271                                                 );
2272
2273         wxASSERT(theString);
2274
2275         size_t nOutLength = CFStringGetLength(theString);
2276
2277         if (szOut == NULL)
2278         {
2279             CFRelease(theString);
2280             return nOutLength;
2281         }
2282
2283         CFRange theRange = { 0, nOutSize };
2284
2285 #if SIZEOF_WCHAR_T == 4
2286         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2287 #endif
2288
2289         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2290
2291         CFRelease(theString);
2292
2293         szUniCharBuffer[nOutLength] = '\0' ;
2294
2295 #if SIZEOF_WCHAR_T == 4
2296         wxMBConvUTF16 converter ;
2297         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2298         delete[] szUniCharBuffer;
2299 #endif
2300
2301         return nOutLength;
2302     }
2303
2304     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2305     {
2306         wxASSERT(szUnConv);
2307
2308         size_t nRealOutSize;
2309         size_t nBufSize = wxWcslen(szUnConv);
2310         UniChar* szUniBuffer = (UniChar*) szUnConv;
2311
2312 #if SIZEOF_WCHAR_T == 4
2313         wxMBConvUTF16 converter ;
2314         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2315         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2316         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2317         nBufSize /= sizeof(UniChar);
2318 #endif
2319
2320         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2321                                 NULL, //allocator
2322                                 szUniBuffer,
2323                                 nBufSize,
2324                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2325                             );
2326
2327         wxASSERT(theString);
2328
2329         //Note that CER puts a BOM when converting to unicode
2330         //so we  check and use getchars instead in that case
2331         if (m_encoding == kCFStringEncodingUnicode)
2332         {
2333             if (szOut != NULL)
2334                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2335
2336             nRealOutSize = CFStringGetLength(theString) + 1;
2337         }
2338         else
2339         {
2340             CFStringGetBytes(
2341                 theString,
2342                 CFRangeMake(0, CFStringGetLength(theString)),
2343                 m_encoding,
2344                 0, //what to put in characters that can't be converted -
2345                     //0 tells CFString to return NULL if it meets such a character
2346                 false, //not an external representation
2347                 (UInt8*) szOut,
2348                 nOutSize,
2349                 (CFIndex*) &nRealOutSize
2350                         );
2351         }
2352
2353         CFRelease(theString);
2354
2355 #if SIZEOF_WCHAR_T == 4
2356         delete[] szUniBuffer;
2357 #endif
2358
2359         return  nRealOutSize - 1;
2360     }
2361
2362     bool IsOk() const
2363     {
2364         return m_encoding != kCFStringEncodingInvalidId &&
2365               CFStringIsEncodingAvailable(m_encoding);
2366     }
2367
2368 private:
2369     CFStringEncoding m_encoding ;
2370 };
2371
2372 #endif // defined(__WXCOCOA__)
2373
2374 // ============================================================================
2375 // Mac conversion classes
2376 // ============================================================================
2377
2378 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2379
2380 class wxMBConv_mac : public wxMBConv
2381 {
2382 public:
2383     wxMBConv_mac()
2384     {
2385         Init(CFStringGetSystemEncoding()) ;
2386     }
2387
2388 #if wxUSE_FONTMAP
2389     wxMBConv_mac(const wxChar* name)
2390     {
2391         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2392     }
2393 #endif
2394
2395     wxMBConv_mac(wxFontEncoding encoding)
2396     {
2397         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2398     }
2399
2400     ~wxMBConv_mac()
2401     {
2402         OSStatus status = noErr ;
2403         status = TECDisposeConverter(m_MB2WC_converter);
2404         status = TECDisposeConverter(m_WC2MB_converter);
2405     }
2406
2407
2408     void Init( TextEncodingBase encoding)
2409     {
2410         OSStatus status = noErr ;
2411         m_char_encoding = encoding ;
2412         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2413
2414         status = TECCreateConverter(&m_MB2WC_converter,
2415                                     m_char_encoding,
2416                                     m_unicode_encoding);
2417         status = TECCreateConverter(&m_WC2MB_converter,
2418                                     m_unicode_encoding,
2419                                     m_char_encoding);
2420     }
2421
2422     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2423     {
2424         OSStatus status = noErr ;
2425         ByteCount byteOutLen ;
2426         ByteCount byteInLen = strlen(psz) ;
2427         wchar_t *tbuf = NULL ;
2428         UniChar* ubuf = NULL ;
2429         size_t res = 0 ;
2430
2431         if (buf == NULL)
2432         {
2433             //apple specs say at least 32
2434             n = wxMax( 32 , byteInLen ) ;
2435             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2436         }
2437         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2438 #if SIZEOF_WCHAR_T == 4
2439         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2440 #else
2441         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2442 #endif
2443         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2444           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2445 #if SIZEOF_WCHAR_T == 4
2446         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2447         // is not properly terminated we get random characters at the end
2448         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2449         wxMBConvUTF16 converter ;
2450         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2451         free( ubuf ) ;
2452 #else
2453         res = byteOutLen / sizeof( UniChar ) ;
2454 #endif
2455         if ( buf == NULL )
2456              free(tbuf) ;
2457
2458         if ( buf  && res < n)
2459             buf[res] = 0;
2460
2461         return res ;
2462     }
2463
2464     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2465     {
2466         OSStatus status = noErr ;
2467         ByteCount byteOutLen ;
2468         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2469
2470         char *tbuf = NULL ;
2471
2472         if (buf == NULL)
2473         {
2474             //apple specs say at least 32
2475             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2476             tbuf = (char*) malloc( n ) ;
2477         }
2478
2479         ByteCount byteBufferLen = n ;
2480         UniChar* ubuf = NULL ;
2481 #if SIZEOF_WCHAR_T == 4
2482         wxMBConvUTF16 converter ;
2483         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2484         byteInLen = unicharlen ;
2485         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2486         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2487 #else
2488         ubuf = (UniChar*) psz ;
2489 #endif
2490         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2491             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2492 #if SIZEOF_WCHAR_T == 4
2493         free( ubuf ) ;
2494 #endif
2495         if ( buf == NULL )
2496             free(tbuf) ;
2497
2498         size_t res = byteOutLen ;
2499         if ( buf  && res < n)
2500         {
2501             buf[res] = 0;
2502
2503             //we need to double-trip to verify it didn't insert any ? in place
2504             //of bogus characters
2505             wxWCharBuffer wcBuf(n);
2506             size_t pszlen = wxWcslen(psz);
2507             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2508                         wxWcslen(wcBuf) != pszlen ||
2509                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2510             {
2511                 // we didn't obtain the same thing we started from, hence
2512                 // the conversion was lossy and we consider that it failed
2513                 return (size_t)-1;
2514             }
2515         }
2516
2517         return res ;
2518     }
2519
2520     bool IsOk() const
2521         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2522
2523 private:
2524     TECObjectRef m_MB2WC_converter ;
2525     TECObjectRef m_WC2MB_converter ;
2526
2527     TextEncodingBase m_char_encoding ;
2528     TextEncodingBase m_unicode_encoding ;
2529 };
2530
2531 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2532
2533 // ============================================================================
2534 // wxEncodingConverter based conversion classes
2535 // ============================================================================
2536
2537 #if wxUSE_FONTMAP
2538
2539 class wxMBConv_wxwin : public wxMBConv
2540 {
2541 private:
2542     void Init()
2543     {
2544         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2545                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2546     }
2547
2548 public:
2549     // temporarily just use wxEncodingConverter stuff,
2550     // so that it works while a better implementation is built
2551     wxMBConv_wxwin(const wxChar* name)
2552     {
2553         if (name)
2554             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2555         else
2556             m_enc = wxFONTENCODING_SYSTEM;
2557
2558         Init();
2559     }
2560
2561     wxMBConv_wxwin(wxFontEncoding enc)
2562     {
2563         m_enc = enc;
2564
2565         Init();
2566     }
2567
2568     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2569     {
2570         size_t inbuf = strlen(psz);
2571         if (buf)
2572         {
2573             if (!m2w.Convert(psz,buf))
2574                 return (size_t)-1;
2575         }
2576         return inbuf;
2577     }
2578
2579     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2580     {
2581         const size_t inbuf = wxWcslen(psz);
2582         if (buf)
2583         {
2584             if (!w2m.Convert(psz,buf))
2585                 return (size_t)-1;
2586         }
2587
2588         return inbuf;
2589     }
2590
2591     bool IsOk() const { return m_ok; }
2592
2593 public:
2594     wxFontEncoding m_enc;
2595     wxEncodingConverter m2w, w2m;
2596
2597 private:
2598     virtual const char *GetMBNul(size_t *nulLen) const
2599     {
2600         switch ( m_enc )
2601         {
2602             case wxFONTENCODING_UTF16BE:
2603             case wxFONTENCODING_UTF16LE:
2604                 *nulLen = 2;
2605                 return "\0";
2606
2607             case wxFONTENCODING_UTF32BE:
2608             case wxFONTENCODING_UTF32LE:
2609                 *nulLen = 4;
2610                 return "\0\0\0";
2611
2612             default:
2613                 *nulLen = 1;
2614                 return "";
2615         }
2616     }
2617
2618     // were we initialized successfully?
2619     bool m_ok;
2620
2621     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2622 };
2623
2624 // make the constructors available for unit testing
2625 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2626 {
2627     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2628     if ( !result->IsOk() )
2629     {
2630         delete result;
2631         return 0;
2632     }
2633     return result;
2634 }
2635
2636 #endif // wxUSE_FONTMAP
2637
2638 // ============================================================================
2639 // wxCSConv implementation
2640 // ============================================================================
2641
2642 void wxCSConv::Init()
2643 {
2644     m_name = NULL;
2645     m_convReal =  NULL;
2646     m_deferred = true;
2647 }
2648
2649 wxCSConv::wxCSConv(const wxChar *charset)
2650 {
2651     Init();
2652
2653     if ( charset )
2654     {
2655         SetName(charset);
2656     }
2657
2658 #if wxUSE_FONTMAP
2659     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2660 #else
2661     m_encoding = wxFONTENCODING_SYSTEM;
2662 #endif
2663 }
2664
2665 wxCSConv::wxCSConv(wxFontEncoding encoding)
2666 {
2667     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2668     {
2669         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2670
2671         encoding = wxFONTENCODING_SYSTEM;
2672     }
2673
2674     Init();
2675
2676     m_encoding = encoding;
2677 }
2678
2679 wxCSConv::~wxCSConv()
2680 {
2681     Clear();
2682 }
2683
2684 wxCSConv::wxCSConv(const wxCSConv& conv)
2685         : wxMBConv()
2686 {
2687     Init();
2688
2689     SetName(conv.m_name);
2690     m_encoding = conv.m_encoding;
2691 }
2692
2693 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2694 {
2695     Clear();
2696
2697     SetName(conv.m_name);
2698     m_encoding = conv.m_encoding;
2699
2700     return *this;
2701 }
2702
2703 void wxCSConv::Clear()
2704 {
2705     free(m_name);
2706     delete m_convReal;
2707
2708     m_name = NULL;
2709     m_convReal = NULL;
2710 }
2711
2712 void wxCSConv::SetName(const wxChar *charset)
2713 {
2714     if (charset)
2715     {
2716         m_name = wxStrdup(charset);
2717         m_deferred = true;
2718     }
2719 }
2720
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when iconv is available: for each
// encoding it remembers the charset name for which an iconv conversion could
// be created, or an empty string if none of the names worked.
static wxEncodingNameCache gs_nameCache;
#endif
2729
2730 wxMBConv *wxCSConv::DoCreate() const
2731 {
2732 #if wxUSE_FONTMAP
2733     wxLogTrace(TRACE_STRCONV,
2734                wxT("creating conversion for %s"),
2735                (m_name ? m_name
2736                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2737 #endif // wxUSE_FONTMAP
2738
2739     // check for the special case of ASCII or ISO8859-1 charset: as we have
2740     // special knowledge of it anyhow, we don't need to create a special
2741     // conversion object
2742     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2743             m_encoding == wxFONTENCODING_DEFAULT )
2744     {
2745         // don't convert at all
2746         return NULL;
2747     }
2748
2749     // we trust OS to do conversion better than we can so try external
2750     // conversion methods first
2751     //
2752     // the full order is:
2753     //      1. OS conversion (iconv() under Unix or Win32 API)
2754     //      2. hard coded conversions for UTF
2755     //      3. wxEncodingConverter as fall back
2756
2757     // step (1)
2758 #ifdef HAVE_ICONV
2759 #if !wxUSE_FONTMAP
2760     if ( m_name )
2761 #endif // !wxUSE_FONTMAP
2762     {
2763         wxString name(m_name);
2764         wxFontEncoding encoding(m_encoding);
2765
2766         if ( !name.empty() )
2767         {
2768             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2769             if ( conv->IsOk() )
2770                 return conv;
2771
2772             delete conv;
2773
2774 #if wxUSE_FONTMAP
2775             encoding =
2776                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2777 #endif // wxUSE_FONTMAP
2778         }
2779 #if wxUSE_FONTMAP
2780         {
2781             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2782             if ( it != gs_nameCache.end() )
2783             {
2784                 if ( it->second.empty() )
2785                     return NULL;
2786
2787                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2788                 if ( conv->IsOk() )
2789                     return conv;
2790
2791                 delete conv;
2792             }
2793
2794             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2795
2796             for ( ; *names; ++names )
2797             {
2798                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2799                 if ( conv->IsOk() )
2800                 {
2801                     gs_nameCache[encoding] = *names;
2802                     return conv;
2803                 }
2804
2805                 delete conv;
2806             }
2807
2808             gs_nameCache[encoding] = _T(""); // cache the failure
2809         }
2810 #endif // wxUSE_FONTMAP
2811     }
2812 #endif // HAVE_ICONV
2813
2814 #ifdef wxHAVE_WIN32_MB2WC
2815     {
2816 #if wxUSE_FONTMAP
2817         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2818                                       : new wxMBConv_win32(m_encoding);
2819         if ( conv->IsOk() )
2820             return conv;
2821
2822         delete conv;
2823 #else
2824         return NULL;
2825 #endif
2826     }
2827 #endif // wxHAVE_WIN32_MB2WC
2828 #if defined(__WXMAC__)
2829     {
2830         // leave UTF16 and UTF32 to the built-ins of wx
2831         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2832             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2833         {
2834
2835 #if wxUSE_FONTMAP
2836             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2837                                         : new wxMBConv_mac(m_encoding);
2838 #else
2839             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2840 #endif
2841             if ( conv->IsOk() )
2842                  return conv;
2843
2844             delete conv;
2845         }
2846     }
2847 #endif
2848 #if defined(__WXCOCOA__)
2849     {
2850         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2851         {
2852
2853 #if wxUSE_FONTMAP
2854             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2855                                           : new wxMBConv_cocoa(m_encoding);
2856 #else
2857             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2858 #endif
2859             if ( conv->IsOk() )
2860                  return conv;
2861
2862             delete conv;
2863         }
2864     }
2865 #endif
2866     // step (2)
2867     wxFontEncoding enc = m_encoding;
2868 #if wxUSE_FONTMAP
2869     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2870     {
2871         // use "false" to suppress interactive dialogs -- we can be called from
2872         // anywhere and popping up a dialog from here is the last thing we want to
2873         // do
2874         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2875     }
2876 #endif // wxUSE_FONTMAP
2877
2878     switch ( enc )
2879     {
2880         case wxFONTENCODING_UTF7:
2881              return new wxMBConvUTF7;
2882
2883         case wxFONTENCODING_UTF8:
2884              return new wxMBConvUTF8;
2885
2886         case wxFONTENCODING_UTF16BE:
2887              return new wxMBConvUTF16BE;
2888
2889         case wxFONTENCODING_UTF16LE:
2890              return new wxMBConvUTF16LE;
2891
2892         case wxFONTENCODING_UTF32BE:
2893              return new wxMBConvUTF32BE;
2894
2895         case wxFONTENCODING_UTF32LE:
2896              return new wxMBConvUTF32LE;
2897
2898         default:
2899              // nothing to do but put here to suppress gcc warnings
2900              ;
2901     }
2902
2903     // step (3)
2904 #if wxUSE_FONTMAP
2905     {
2906         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2907                                       : new wxMBConv_wxwin(m_encoding);
2908         if ( conv->IsOk() )
2909             return conv;
2910
2911         delete conv;
2912     }
2913 #endif // wxUSE_FONTMAP
2914
2915     // NB: This is a hack to prevent deadlock. What could otherwise happen
2916     //     in Unicode build: wxConvLocal creation ends up being here
2917     //     because of some failure and logs the error. But wxLog will try to
2918     //     attach timestamp, for which it will need wxConvLocal (to convert
2919     //     time to char* and then wchar_t*), but that fails, tries to log
2920     //     error, but wxLog has a (already locked) critical section that
2921     //     guards static buffer.
2922     static bool alreadyLoggingError = false;
2923     if (!alreadyLoggingError)
2924     {
2925         alreadyLoggingError = true;
2926         wxLogError(_("Cannot convert from the charset '%s'!"),
2927                    m_name ? m_name
2928                       :
2929 #if wxUSE_FONTMAP
2930                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2931 #else // !wxUSE_FONTMAP
2932                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2933 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2934               );
2935         alreadyLoggingError = false;
2936     }
2937
2938     return NULL;
2939 }
2940
2941 void wxCSConv::CreateConvIfNeeded() const
2942 {
2943     if ( m_deferred )
2944     {
2945         wxCSConv *self = (wxCSConv *)this; // const_cast
2946
2947 #if wxUSE_INTL
2948         // if we don't have neither the name nor the encoding, use the default
2949         // encoding for this system
2950         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2951         {
2952             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2953         }
2954 #endif // wxUSE_INTL
2955
2956         self->m_convReal = DoCreate();
2957         self->m_deferred = false;
2958     }
2959 }
2960
2961 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2962 {
2963     CreateConvIfNeeded();
2964
2965     if (m_convReal)
2966         return m_convReal->MB2WC(buf, psz, n);
2967
2968     // latin-1 (direct)
2969     size_t len = strlen(psz);
2970
2971     if (buf)
2972     {
2973         for (size_t c = 0; c <= len; c++)
2974             buf[c] = (unsigned char)(psz[c]);
2975     }
2976
2977     return len;
2978 }
2979
2980 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2981 {
2982     CreateConvIfNeeded();
2983
2984     if (m_convReal)
2985         return m_convReal->WC2MB(buf, psz, n);
2986
2987     // latin-1 (direct)
2988     const size_t len = wxWcslen(psz);
2989     if (buf)
2990     {
2991         for (size_t c = 0; c <= len; c++)
2992         {
2993             if (psz[c] > 0xFF)
2994                 return (size_t)-1;
2995             buf[c] = (char)psz[c];
2996         }
2997     }
2998     else
2999     {
3000         for (size_t c = 0; c <= len; c++)
3001         {
3002             if (psz[c] > 0xFF)
3003                 return (size_t)-1;
3004         }
3005     }
3006
3007     return len;
3008 }
3009
3010 const char *wxCSConv::GetMBNul(size_t *nulLen) const
3011 {
3012     CreateConvIfNeeded();
3013
3014     if ( m_convReal )
3015     {
3016         // cast needed just to call private function of m_convReal
3017         return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
3018     }
3019
3020     *nulLen = 1;
3021     return "";
3022 }
3023
3024 // ----------------------------------------------------------------------------
3025 // globals
3026 // ----------------------------------------------------------------------------
3027
3028 #ifdef __WINDOWS__
3029     static wxMBConv_win32 wxConvLibcObj;
3030 #elif defined(__WXMAC__) && !defined(__MACH__)
3031     static wxMBConv_mac wxConvLibcObj ;
3032 #else
3033     static wxMBConvLibc wxConvLibcObj;
3034 #endif
3035
3036 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3037 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3038 static wxMBConvUTF7 wxConvUTF7Obj;
3039 static wxMBConvUTF8 wxConvUTF8Obj;
3040
3041 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3042 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3043 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3044 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3045 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3046 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3047 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3048 #ifdef __WXOSX__
3049                                     wxConvUTF8Obj;
3050 #else
3051                                     wxConvLibcObj;
3052 #endif
3053
3054
3055 #else // !wxUSE_WCHAR_T
3056
3057 // stand-ins in absence of wchar_t
3058 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3059                                 wxConvISO8859_1,
3060                                 wxConvLocal,
3061                                 wxConvUTF8;
3062
3063 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T