src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // ----------------------------------------------------------------------------
  90 // UTF-16 en/decoding to/from UCS-4
  91 // ----------------------------------------------------------------------------
  92
  93
  94 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  95 {
  96     if (input<=0xffff)
  97     {
  98         if (output)
  99             *output = (wxUint16) input;
 100         return 1;
 101     }
 102     else if (input>=0x110000)
 103     {
 104         return (size_t)-1;
 105     }
 106     else
 107     {
 108         if (output)
 109         {
 110             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 111             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 112         }
 113         return 2;
 114     }
 115 }
 116
 117 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 118 {
 119     if ((*input<0xd800) || (*input>0xdfff))
 120     {
 121         output = *input;
 122         return 1;
 123     }
 124     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 125     {
 126         output = *input;
 127         return (size_t)-1;
 128     }
 129     else
 130     {
 131         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 132         return 2;
 133     }
 134 }
 135
 136
 137 // ----------------------------------------------------------------------------
 138 // wxMBConv
 139 // ----------------------------------------------------------------------------
 140
 141 wxMBConv::~wxMBConv()
 142 {
 143     // nothing to do here (necessary for Darwin linking probably)
 144 }
 145
 146 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 147 {
 148     if ( psz )
 149     {
 150         // calculate the length of the buffer needed first
 151         size_t nLen = MB2WC(NULL, psz, 0);
 152         if ( nLen != (size_t)-1 )
 153         {
 154             // now do the actual conversion
 155             wxWCharBuffer buf(nLen);
 156             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 157             if ( nLen != (size_t)-1 )
 158             {
 159                 return buf;
 160             }
 161         }
 162     }
 163
 164     wxWCharBuffer buf((wchar_t *)NULL);
 165
 166     return buf;
 167 }
 168
 169 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 170 {
 171     if ( pwz )
 172     {
 173         size_t nLen = WC2MB(NULL, pwz, 0);
 174         if ( nLen != (size_t)-1 )
 175         {
 176             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 177             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 178             if ( nLen != (size_t)-1 )
 179             {
 180                 return buf;
 181             }
 182         }
 183     }
 184
 185     wxCharBuffer buf((char *)NULL);
 186
 187     return buf;
 188 }
 189
 190 const wxWCharBuffer
 191 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 192 {
 193     // the currently accumulated wide characters
 194     wxWCharBuffer wbuf;
 195
 196     // the current length of wbuf
 197     size_t lenBuf = 0;
 198
 199     // we need to know the representation of L'\0' for this conversion
 200     size_t nulLen;
 201     const char * const nul = GetMBNul(&nulLen);
 202     if ( nulLen == (size_t)-1 || nulLen == 0 )
 203         return wxWCharBuffer();
 204
 205     // make a copy of the input string unless it is already properly
 206     // NUL-terminated
 207     wxCharBuffer bufTmp;
 208
 209     // now we can compute the input size if we were not given it: notice that
 210     // in this case the string must be properly NUL-terminated, of course, as
 211     // otherwise we have no way of knowing how long it is
 212     if ( inLen == (size_t)-1 )
 213     {
 214         // not the most efficient algorithm but it shouldn't matter as normally
 215         // there are not many NULs in the string and so normally memcmp()
 216         // should stop on the first character
 217         const char *p = in;
 218         while ( memcmp(p, nul, nulLen) != 0 )
 219             p++;
 220
 221         inLen = p - in + nulLen;
 222     }
 223     else // we already have the size
 224     {
 225         // check if it's not already NUL-terminated too to avoid the copy
 226         if ( inLen < nulLen || memcmp(in + inLen - nulLen, nul, nulLen) != 0 )
 227         {
 228             // make a copy in order to properly NUL-terminate the string
 229             bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
 230             memcpy(bufTmp.data(), in, inLen);
 231             memcpy(bufTmp.data() + inLen, nul, nulLen);
 232         }
 233     }
 234
 235     if ( bufTmp )
 236         in = bufTmp;
 237
 238     for ( const char * const inEnd = in + inLen;; )
 239     {
 240         // try to convert the current chunk if anything left
 241         size_t lenChunk = in < inEnd ? MB2WC(NULL, in, 0) : 0;
 242         if ( lenChunk == 0 )
 243         {
 244             // nothing left in the input string, conversion succeeded
 245             if ( outLen )
 246             {
 247                 // we shouldn't include the last NUL in the result length
 248                 *outLen = lenBuf ? lenBuf - 1 : 0;
 249             }
 250
 251             return wbuf;
 252         }
 253
 254         if ( lenChunk == (size_t)-1 )
 255             break;
 256
 257         const size_t lenBufNew = lenBuf + lenChunk;
 258         if ( !wbuf.extend(lenBufNew) )
 259             break;
 260
 261         lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 262         if ( lenChunk == (size_t)-1 )
 263             break;
 264
 265         // +! for the embedded NUL (if something follows)
 266         lenBuf = lenBufNew + 1;
 267
 268         // advance the input pointer past the end of this chunk
 269         while ( memcmp(in, nul, nulLen) != 0 )
 270             in++;
 271
 272         in += nulLen; // skipping over its terminator as well
 273     }
 274
 275     // conversion failed
 276     if ( outLen )
 277         *outLen = 0;
 278
 279     return wxWCharBuffer();
 280 }
 281
 282 const wxCharBuffer
 283 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 284 {
 285     // the currently accumulated multibyte characters
 286     wxCharBuffer buf;
 287
 288     // the current length of buf
 289     size_t lenBuf = 0;
 290
 291     // make a copy of the input string unless it is already properly
 292     // NUL-terminated
 293     //
 294     // if we don't know its length we have no choice but to assume that it is,
 295     // indeed, properly terminated
 296     wxWCharBuffer bufTmp;
 297     if ( inLen == (size_t)-1 )
 298     {
 299         inLen = wxWcslen(in) + 1;
 300     }
 301     else if ( inLen != 0 && in[inLen - 1] != L'\0' )
 302     {
 303         // make a copy in order to properly NUL-terminate the string
 304         bufTmp = wxWCharBuffer(inLen);
 305         memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
 306     }
 307
 308     if ( bufTmp )
 309         in = bufTmp;
 310
 311     for ( const wchar_t * const inEnd = in + inLen;; )
 312     {
 313         // try to convert the current chunk, if anything left
 314         size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
 315         if ( lenChunk == 0 )
 316         {
 317             // nothing left in the input string, conversion succeeded
 318             if ( outLen )
 319                 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
 320
 321             return buf;
 322         }
 323
 324         if ( lenChunk == (size_t)-1 )
 325             break;
 326
 327         const size_t lenBufNew = lenBuf + lenChunk;
 328         if ( !buf.extend(lenBufNew) )
 329             break;
 330
 331         lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 332         if ( lenChunk == (size_t)-1 )
 333             break;
 334
 335         // chunk successfully converted, go to the next one
 336         in += wxWcslen(in) + 1 /* skip NUL too */;
 337         lenBuf = lenBufNew + 1;
 338     }
 339
 340     // conversion failed
 341     if ( outLen )
 342         *outLen = 0;
 343
 344     return wxCharBuffer();
 345 }
 346
 347 // ----------------------------------------------------------------------------
 348 // wxMBConvLibc
 349 // ----------------------------------------------------------------------------
 350
 351 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 352 {
 353     return wxMB2WC(buf, psz, n);
 354 }
 355
 356 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 357 {
 358     return wxWC2MB(buf, psz, n);
 359 }
 360
 361 // ----------------------------------------------------------------------------
 362 // wxConvBrokenFileNames
 363 // ----------------------------------------------------------------------------
 364
 365 #ifdef __UNIX__
 366
 367 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 368 {
 369     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 370                   || wxStricmp(charset, _T("UTF8")) == 0  )
 371         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 372     else
 373         m_conv = new wxCSConv(charset);
 374 }
 375
 376 #endif // __UNIX__
 377
 378 // ----------------------------------------------------------------------------
 379 // UTF-7
 380 // ----------------------------------------------------------------------------
 381
 382 // Implementation (C) 2004 Fredrik Roubert
 383
 384 //
 385 // BASE64 decoding table
 386 //
 387 static const unsigned char utf7unb64[] =
 388 {
 389     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 390     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 391     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 392     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 393     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 394     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 395     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 396     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 397     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 398     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 399     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 400     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 401     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 402     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 403     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 404     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 405     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 406     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 407     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 408     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 409     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 410     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 411     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 412     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 413     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 414     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 415     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 416     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 417     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 418     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 419     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 420     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 421 };
 422
 423 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 424 {
 425     size_t len = 0;
 426
 427     while ( *psz && (!buf || (len < n)) )
 428     {
 429         unsigned char cc = *psz++;
 430         if (cc != '+')
 431         {
 432             // plain ASCII char
 433             if (buf)
 434                 *buf++ = cc;
 435             len++;
 436         }
 437         else if (*psz == '-')
 438         {
 439             // encoded plus sign
 440             if (buf)
 441                 *buf++ = cc;
 442             len++;
 443             psz++;
 444         }
 445         else // start of BASE64 encoded string
 446         {
 447             bool lsb, ok;
 448             unsigned int d, l;
 449             for ( ok = lsb = false, d = 0, l = 0;
 450                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 451                   psz++ )
 452             {
 453                 d <<= 6;
 454                 d += cc;
 455                 for (l += 6; l >= 8; lsb = !lsb)
 456                 {
 457                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 458                     if (lsb)
 459                     {
 460                         if (buf)
 461                             *buf++ |= c;
 462                         len ++;
 463                     }
 464                     else
 465                     {
 466                         if (buf)
 467                             *buf = (wchar_t)(c << 8);
 468                     }
 469
 470                     ok = true;
 471                 }
 472             }
 473
 474             if ( !ok )
 475             {
 476                 // in valid UTF7 we should have valid characters after '+'
 477                 return (size_t)-1;
 478             }
 479
 480             if (*psz == '-')
 481                 psz++;
 482         }
 483     }
 484
 485     if ( buf && (len < n) )
 486         *buf = '\0';
 487
 488     return len;
 489 }
 490
 491 //
 492 // BASE64 encoding table
 493 //
 494 static const unsigned char utf7enb64[] =
 495 {
 496     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 497     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 498     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 499     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 500     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 501     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 502     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 503     '4', '5', '6', '7', '8', '9', '+', '/'
 504 };
 505
 506 //
 507 // UTF-7 encoding table
 508 //
 509 // 0 - Set D (directly encoded characters)
 510 // 1 - Set O (optional direct characters)
 511 // 2 - whitespace characters (optional)
 512 // 3 - special characters
 513 //
 514 static const unsigned char utf7encode[128] =
 515 {
 516     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 517     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 518     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 519     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 520     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 521     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 522     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 523     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 524 };
 525
 526 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 527 {
 528     size_t len = 0;
 529
 530     while (*psz && ((!buf) || (len < n)))
 531     {
 532         wchar_t cc = *psz++;
 533         if (cc < 0x80 && utf7encode[cc] < 1)
 534         {
 535             // plain ASCII char
 536             if (buf)
 537                 *buf++ = (char)cc;
 538             len++;
 539         }
 540 #ifndef WC_UTF16
 541         else if (((wxUint32)cc) > 0xffff)
 542         {
 543             // no surrogate pair generation (yet?)
 544             return (size_t)-1;
 545         }
 546 #endif
 547         else
 548         {
 549             if (buf)
 550                 *buf++ = '+';
 551             len++;
 552             if (cc != '+')
 553             {
 554                 // BASE64 encode string
 555                 unsigned int lsb, d, l;
 556                 for (d = 0, l = 0; /*nothing*/; psz++)
 557                 {
 558                     for (lsb = 0; lsb < 2; lsb ++)
 559                     {
 560                         d <<= 8;
 561                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 562
 563                         for (l += 8; l >= 6; )
 564                         {
 565                             l -= 6;
 566                             if (buf)
 567                                 *buf++ = utf7enb64[(d >> l) % 64];
 568                             len++;
 569                         }
 570                     }
 571                     cc = *psz;
 572                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 573                         break;
 574                 }
 575                 if (l != 0)
 576                 {
 577                     if (buf)
 578                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 579                     len++;
 580                 }
 581             }
 582             if (buf)
 583                 *buf++ = '-';
 584             len++;
 585         }
 586     }
 587     if (buf && (len < n))
 588         *buf = 0;
 589     return len;
 590 }
 591
 592 // ----------------------------------------------------------------------------
 593 // UTF-8
 594 // ----------------------------------------------------------------------------
 595
 596 static wxUint32 utf8_max[]=
 597     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 598
 599 // boundaries of the private use area we use to (temporarily) remap invalid
 600 // characters invalid in a UTF-8 encoded string
 601 const wxUint32 wxUnicodePUA = 0x100000;
 602 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 603
 604 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 605 {
 606     size_t len = 0;
 607
 608     while (*psz && ((!buf) || (len < n)))
 609     {
 610         const char *opsz = psz;
 611         bool invalid = false;
 612         unsigned char cc = *psz++, fc = cc;
 613         unsigned cnt;
 614         for (cnt = 0; fc & 0x80; cnt++)
 615             fc <<= 1;
 616         if (!cnt)
 617         {
 618             // plain ASCII char
 619             if (buf)
 620                 *buf++ = cc;
 621             len++;
 622
 623             // escape the escape character for octal escapes
 624             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 625                     && cc == '\\' && (!buf || len < n))
 626             {
 627                 if (buf)
 628                     *buf++ = cc;
 629                 len++;
 630             }
 631         }
 632         else
 633         {
 634             cnt--;
 635             if (!cnt)
 636             {
 637                 // invalid UTF-8 sequence
 638                 invalid = true;
 639             }
 640             else
 641             {
 642                 unsigned ocnt = cnt - 1;
 643                 wxUint32 res = cc & (0x3f >> cnt);
 644                 while (cnt--)
 645                 {
 646                     cc = *psz;
 647                     if ((cc & 0xC0) != 0x80)
 648                     {
 649                         // invalid UTF-8 sequence
 650                         invalid = true;
 651                         break;
 652                     }
 653                     psz++;
 654                     res = (res << 6) | (cc & 0x3f);
 655                 }
 656                 if (invalid || res <= utf8_max[ocnt])
 657                 {
 658                     // illegal UTF-8 encoding
 659                     invalid = true;
 660                 }
 661                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 662                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 663                 {
 664                     // if one of our PUA characters turns up externally
 665                     // it must also be treated as an illegal sequence
 666                     // (a bit like you have to escape an escape character)
 667                     invalid = true;
 668                 }
 669                 else
 670                 {
 671 #ifdef WC_UTF16
 672                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 673                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 674                     if (pa == (size_t)-1)
 675                     {
 676                         invalid = true;
 677                     }
 678                     else
 679                     {
 680                         if (buf)
 681                             buf += pa;
 682                         len += pa;
 683                     }
 684 #else // !WC_UTF16
 685                     if (buf)
 686                         *buf++ = (wchar_t)res;
 687                     len++;
 688 #endif // WC_UTF16/!WC_UTF16
 689                 }
 690             }
 691             if (invalid)
 692             {
 693                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 694                 {
 695                     while (opsz < psz && (!buf || len < n))
 696                     {
 697 #ifdef WC_UTF16
 698                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 699                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 700                         wxASSERT(pa != (size_t)-1);
 701                         if (buf)
 702                             buf += pa;
 703                         opsz++;
 704                         len += pa;
 705 #else
 706                         if (buf)
 707                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 708                         opsz++;
 709                         len++;
 710 #endif
 711                     }
 712                 }
 713                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 714                 {
 715                     while (opsz < psz && (!buf || len < n))
 716                     {
 717                         if ( buf && len + 3 < n )
 718                         {
 719                             unsigned char on = *opsz;
 720                             *buf++ = L'\\';
 721                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 722                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 723                             *buf++ = (wchar_t)( L'0' + on % 010 );
 724                         }
 725                         opsz++;
 726                         len += 4;
 727                     }
 728                 }
 729                 else // MAP_INVALID_UTF8_NOT
 730                 {
 731                     return (size_t)-1;
 732                 }
 733             }
 734         }
 735     }
 736     if (buf && (len < n))
 737         *buf = 0;
 738     return len;
 739 }
 740
 741 static inline bool isoctal(wchar_t wch)
 742 {
 743     return L'0' <= wch && wch <= L'7';
 744 }
 745
 746 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 747 {
 748     size_t len = 0;
 749
 750     while (*psz && ((!buf) || (len < n)))
 751     {
 752         wxUint32 cc;
 753 #ifdef WC_UTF16
 754         // cast is ok for WC_UTF16
 755         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 756         psz += (pa == (size_t)-1) ? 1 : pa;
 757 #else
 758         cc=(*psz++) & 0x7fffffff;
 759 #endif
 760
 761         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 762                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 763         {
 764             if (buf)
 765                 *buf++ = (char)(cc - wxUnicodePUA);
 766             len++;
 767         }
 768         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 769                     && cc == L'\\' && psz[0] == L'\\' )
 770         {
 771             if (buf)
 772                 *buf++ = (char)cc;
 773             psz++;
 774             len++;
 775         }
 776         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 777                     cc == L'\\' &&
 778                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 779         {
 780             if (buf)
 781             {
 782                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 783                                  (psz[1] - L'0')*010 +
 784                                  (psz[2] - L'0'));
 785             }
 786
 787             psz += 3;
 788             len++;
 789         }
 790         else
 791         {
 792             unsigned cnt;
 793             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 794             if (!cnt)
 795             {
 796                 // plain ASCII char
 797                 if (buf)
 798                     *buf++ = (char) cc;
 799                 len++;
 800             }
 801
 802             else
 803             {
 804                 len += cnt + 1;
 805                 if (buf)
 806                 {
 807                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 808                     while (cnt--)
 809                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 810                 }
 811             }
 812         }
 813     }
 814
 815     if (buf && (len<n))
 816         *buf = 0;
 817
 818     return len;
 819 }
 820
 821 // ----------------------------------------------------------------------------
 822 // UTF-16
 823 // ----------------------------------------------------------------------------
 824
 825 #ifdef WORDS_BIGENDIAN
 826     #define wxMBConvUTF16straight wxMBConvUTF16BE
 827     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 828 #else
 829     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 830     #define wxMBConvUTF16straight wxMBConvUTF16LE
 831 #endif
 832
 833
 834 #ifdef WC_UTF16
 835
 836 // copy 16bit MB to 16bit String
 837 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 838 {
 839     size_t len=0;
 840
 841     while (*(wxUint16*)psz && (!buf || len < n))
 842     {
 843         if (buf)
 844             *buf++ = *(wxUint16*)psz;
 845         len++;
 846
 847         psz += sizeof(wxUint16);
 848     }
 849     if (buf && len<n)   *buf=0;
 850
 851     return len;
 852 }
 853
 854
 855 // copy 16bit String to 16bit MB
 856 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 857 {
 858     size_t len=0;
 859
 860     while (*psz && (!buf || len < n))
 861     {
 862         if (buf)
 863         {
 864             *(wxUint16*)buf = *psz;
 865             buf += sizeof(wxUint16);
 866         }
 867         len += sizeof(wxUint16);
 868         psz++;
 869     }
 870     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 871
 872     return len;
 873 }
 874
 875
 876 // swap 16bit MB to 16bit String
 877 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 878 {
 879     size_t len = 0;
 880
 881     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 882     // inside the string
 883     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 884     {
 885         if ( buf )
 886         {
 887             ((char *)buf)[0] = psz[1];
 888             ((char *)buf)[1] = psz[0];
 889             buf++;
 890         }
 891         len++;
 892         psz += 2;
 893     }
 894
 895     if ( buf && len < n )
 896         *buf = L'\0';
 897
 898     return len;
 899 }
 900
 901
 902 // swap 16bit MB to 16bit String
 903 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 904 {
 905     size_t len = 0;
 906
 907     while ( *psz && (!buf || len < n) )
 908     {
 909         if ( buf )
 910         {
 911             *buf++ = ((char*)psz)[1];
 912             *buf++ = ((char*)psz)[0];
 913         }
 914         len += 2;
 915         psz++;
 916     }
 917
 918     if ( buf && len < n )
 919         *buf = '\0';
 920
 921     return len;
 922 }
 923
 924
 925 #else // WC_UTF16
 926
 927
 928 // copy 16bit MB to 32bit String
 929 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 930 {
 931     size_t len=0;
 932
 933     while (*(wxUint16*)psz && (!buf || len < n))
 934     {
 935         wxUint32 cc;
 936         size_t pa=decode_utf16((wxUint16*)psz, cc);
 937         if (pa == (size_t)-1)
 938             return pa;
 939
 940         if (buf)
 941             *buf++ = (wchar_t)cc;
 942         len++;
 943         psz += pa * sizeof(wxUint16);
 944     }
 945     if (buf && len<n)   *buf=0;
 946
 947     return len;
 948 }
 949
 950
 951 // copy 32bit String to 16bit MB
 952 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 953 {
 954     size_t len=0;
 955
 956     while (*psz && (!buf || len < n))
 957     {
 958         wxUint16 cc[2];
 959         size_t pa=encode_utf16(*psz, cc);
 960
 961         if (pa == (size_t)-1)
 962             return pa;
 963
 964         if (buf)
 965         {
 966             *(wxUint16*)buf = cc[0];
 967             buf += sizeof(wxUint16);
 968             if (pa > 1)
 969             {
 970                 *(wxUint16*)buf = cc[1];
 971                 buf += sizeof(wxUint16);
 972             }
 973         }
 974
 975         len += pa*sizeof(wxUint16);
 976         psz++;
 977     }
 978     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 979
 980     return len;
 981 }
 982
 983
 984 // swap 16bit MB to 32bit String
 985 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 986 {
 987     size_t len=0;
 988
 989     while (*(wxUint16*)psz && (!buf || len < n))
 990     {
 991         wxUint32 cc;
 992         char tmp[4];
 993         tmp[0]=psz[1];  tmp[1]=psz[0];
 994         tmp[2]=psz[3];  tmp[3]=psz[2];
 995
 996         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 997         if (pa == (size_t)-1)
 998             return pa;
 999
1000         if (buf)
1001             *buf++ = (wchar_t)cc;
1002
1003         len++;
1004         psz += pa * sizeof(wxUint16);
1005     }
1006     if (buf && len<n)   *buf=0;
1007
1008     return len;
1009 }
1010
1011
1012 // swap 32bit String to 16bit MB
1013 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014 {
1015     size_t len=0;
1016
1017     while (*psz && (!buf || len < n))
1018     {
1019         wxUint16 cc[2];
1020         size_t pa=encode_utf16(*psz, cc);
1021
1022         if (pa == (size_t)-1)
1023             return pa;
1024
1025         if (buf)
1026         {
1027             *buf++ = ((char*)cc)[1];
1028             *buf++ = ((char*)cc)[0];
1029             if (pa > 1)
1030             {
1031                 *buf++ = ((char*)cc)[3];
1032                 *buf++ = ((char*)cc)[2];
1033             }
1034         }
1035
1036         len += pa*sizeof(wxUint16);
1037         psz++;
1038     }
1039     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1040
1041     return len;
1042 }
1043
1044 #endif // WC_UTF16
1045
1046
1047 // ----------------------------------------------------------------------------
1048 // UTF-32
1049 // ----------------------------------------------------------------------------
1050
1051 #ifdef WORDS_BIGENDIAN
1052 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1053 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1054 #else
1055 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1056 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1057 #endif
1058
1059
1060 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1061 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1062
1063
1064 #ifdef WC_UTF16
1065
1066 // copy 32bit MB to 16bit String
1067 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1068 {
1069     size_t len=0;
1070
1071     while (*(wxUint32*)psz && (!buf || len < n))
1072     {
1073         wxUint16 cc[2];
1074
1075         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1076         if (pa == (size_t)-1)
1077             return pa;
1078
1079         if (buf)
1080         {
1081             *buf++ = cc[0];
1082             if (pa > 1)
1083                 *buf++ = cc[1];
1084         }
1085         len += pa;
1086         psz += sizeof(wxUint32);
1087     }
1088     if (buf && len<n)   *buf=0;
1089
1090     return len;
1091 }
1092
1093
1094 // copy 16bit String to 32bit MB
1095 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1096 {
1097     size_t len=0;
1098
1099     while (*psz && (!buf || len < n))
1100     {
1101         wxUint32 cc;
1102
1103         // cast is ok for WC_UTF16
1104         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1105         if (pa == (size_t)-1)
1106             return pa;
1107
1108         if (buf)
1109         {
1110             *(wxUint32*)buf = cc;
1111             buf += sizeof(wxUint32);
1112         }
1113         len += sizeof(wxUint32);
1114         psz += pa;
1115     }
1116
1117     if (buf && len<=n-sizeof(wxUint32))
1118         *(wxUint32*)buf=0;
1119
1120     return len;
1121 }
1122
1123
1124
1125 // swap 32bit MB to 16bit String
1126 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1127 {
1128     size_t len=0;
1129
1130     while (*(wxUint32*)psz && (!buf || len < n))
1131     {
1132         char tmp[4];
1133         tmp[0] = psz[3];   tmp[1] = psz[2];
1134         tmp[2] = psz[1];   tmp[3] = psz[0];
1135
1136
1137         wxUint16 cc[2];
1138
1139         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1140         if (pa == (size_t)-1)
1141             return pa;
1142
1143         if (buf)
1144         {
1145             *buf++ = cc[0];
1146             if (pa > 1)
1147                 *buf++ = cc[1];
1148         }
1149         len += pa;
1150         psz += sizeof(wxUint32);
1151     }
1152
1153     if (buf && len<n)
1154         *buf=0;
1155
1156     return len;
1157 }
1158
1159
1160 // swap 16bit String to 32bit MB
1161 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1162 {
1163     size_t len=0;
1164
1165     while (*psz && (!buf || len < n))
1166     {
1167         char cc[4];
1168
1169         // cast is ok for WC_UTF16
1170         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1171         if (pa == (size_t)-1)
1172             return pa;
1173
1174         if (buf)
1175         {
1176             *buf++ = cc[3];
1177             *buf++ = cc[2];
1178             *buf++ = cc[1];
1179             *buf++ = cc[0];
1180         }
1181         len += sizeof(wxUint32);
1182         psz += pa;
1183     }
1184
1185     if (buf && len<=n-sizeof(wxUint32))
1186         *(wxUint32*)buf=0;
1187
1188     return len;
1189 }
1190
1191 #else // WC_UTF16
1192
1193
1194 // copy 32bit MB to 32bit String
1195 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1196 {
1197     size_t len=0;
1198
1199     while (*(wxUint32*)psz && (!buf || len < n))
1200     {
1201         if (buf)
1202             *buf++ = (wchar_t)(*(wxUint32*)psz);
1203         len++;
1204         psz += sizeof(wxUint32);
1205     }
1206
1207     if (buf && len<n)
1208         *buf=0;
1209
1210     return len;
1211 }
1212
1213
1214 // copy 32bit String to 32bit MB
1215 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1216 {
1217     size_t len=0;
1218
1219     while (*psz && (!buf || len < n))
1220     {
1221         if (buf)
1222         {
1223             *(wxUint32*)buf = *psz;
1224             buf += sizeof(wxUint32);
1225         }
1226
1227         len += sizeof(wxUint32);
1228         psz++;
1229     }
1230
1231     if (buf && len<=n-sizeof(wxUint32))
1232         *(wxUint32*)buf=0;
1233
1234     return len;
1235 }
1236
1237
1238 // swap 32bit MB to 32bit String
1239 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1240 {
1241     size_t len=0;
1242
1243     while (*(wxUint32*)psz && (!buf || len < n))
1244     {
1245         if (buf)
1246         {
1247             ((char *)buf)[0] = psz[3];
1248             ((char *)buf)[1] = psz[2];
1249             ((char *)buf)[2] = psz[1];
1250             ((char *)buf)[3] = psz[0];
1251             buf++;
1252         }
1253         len++;
1254         psz += sizeof(wxUint32);
1255     }
1256
1257     if (buf && len<n)
1258         *buf=0;
1259
1260     return len;
1261 }
1262
1263
1264 // swap 32bit String to 32bit MB
1265 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1266 {
1267     size_t len=0;
1268
1269     while (*psz && (!buf || len < n))
1270     {
1271         if (buf)
1272         {
1273             *buf++ = ((char *)psz)[3];
1274             *buf++ = ((char *)psz)[2];
1275             *buf++ = ((char *)psz)[1];
1276             *buf++ = ((char *)psz)[0];
1277         }
1278         len += sizeof(wxUint32);
1279         psz++;
1280     }
1281
1282     if (buf && len<=n-sizeof(wxUint32))
1283         *(wxUint32*)buf=0;
1284
1285     return len;
1286 }
1287
1288
1289 #endif // WC_UTF16
1290
1291
1292 // ============================================================================
1293 // The classes doing conversion using the iconv_xxx() functions
1294 // ============================================================================
1295
1296 #ifdef HAVE_ICONV
1297
1298 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1299 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1300 //     (unless there's yet another bug in glibc) the only case when iconv()
1301 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1302 //     left in the input buffer -- when _real_ error occurs,
1303 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1304 //     iconv() failure.
1305 //     [This bug does not appear in glibc 2.2.]
1306 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1307 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1308                                      (errno != E2BIG || bufLeft != 0))
1309 #else
1310 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1311 #endif
1312
1313 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1314
1315 #define ICONV_T_INVALID ((iconv_t)-1)
1316
1317 #if SIZEOF_WCHAR_T == 4
1318     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1319     #define WC_ENC      wxFONTENCODING_UTF32
1320 #elif SIZEOF_WCHAR_T == 2
1321     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1322     #define WC_ENC      wxFONTENCODING_UTF16
1323 #else // sizeof(wchar_t) != 2 nor 4
1324     // does this ever happen?
1325     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1326 #endif
1327
1328 // ----------------------------------------------------------------------------
1329 // wxMBConv_iconv: encapsulates an iconv character set
1330 // ----------------------------------------------------------------------------
1331
1332 class wxMBConv_iconv : public wxMBConv
1333 {
1334 public:
1335     wxMBConv_iconv(const wxChar *name);
1336     virtual ~wxMBConv_iconv();
1337
1338     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1339     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1340
1341     bool IsOk() const
1342         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1343
1344 protected:
1345     // the iconv handlers used to translate from multibyte to wide char and in
1346     // the other direction
1347     iconv_t m2w,
1348             w2m;
1349 #if wxUSE_THREADS
1350     // guards access to m2w and w2m objects
1351     wxMutex m_iconvMutex;
1352 #endif
1353
1354 private:
1355     virtual const char *GetMBNul(size_t *nulLen) const;
1356
1357     // the name (for iconv_open()) of a wide char charset -- if none is
1358     // available on this machine, it will remain NULL
1359     static wxString ms_wcCharsetName;
1360
1361     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1362     // different endian-ness than the native one
1363     static bool ms_wcNeedsSwap;
1364
1365     // NUL representation
1366     size_t m_nulLen;
1367     char m_nulBuf[8];
1368 };
1369
1370 // make the constructor available for unit testing
1371 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1372 {
1373     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1374     if ( !result->IsOk() )
1375     {
1376         delete result;
1377         return 0;
1378     }
1379     return result;
1380 }
1381
1382 wxString wxMBConv_iconv::ms_wcCharsetName;
1383 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1384
1385 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1386 {
1387     m_nulLen = (size_t)-2;
1388
1389     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1390     // names for the charsets
1391     const wxCharBuffer cname(wxString(name).ToAscii());
1392
1393     // check for charset that represents wchar_t:
1394     if ( ms_wcCharsetName.empty() )
1395     {
1396         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1397
1398 #if wxUSE_FONTMAP
1399         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1400 #else // !wxUSE_FONTMAP
1401         static const wxChar *names[] =
1402         {
1403 #if SIZEOF_WCHAR_T == 4
1404             _T("UCS-4"),
1405 #elif SIZEOF_WCHAR_T = 2
1406             _T("UCS-2"),
1407 #endif
1408             NULL
1409         };
1410 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1411
1412         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1413         {
1414             const wxString nameCS(*names);
1415
1416             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1417             wxString nameXE(nameCS);
1418             #ifdef WORDS_BIGENDIAN
1419                 nameXE += _T("BE");
1420             #else // little endian
1421                 nameXE += _T("LE");
1422             #endif
1423
1424             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1425                        nameXE.c_str());
1426
1427             m2w = iconv_open(nameXE.ToAscii(), cname);
1428             if ( m2w == ICONV_T_INVALID )
1429             {
1430                 // try charset w/o bytesex info (e.g. "UCS4")
1431                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1432                            nameCS.c_str());
1433                 m2w = iconv_open(nameCS.ToAscii(), cname);
1434
1435                 // and check for bytesex ourselves:
1436                 if ( m2w != ICONV_T_INVALID )
1437                 {
1438                     char    buf[2], *bufPtr;
1439                     wchar_t wbuf[2], *wbufPtr;
1440                     size_t  insz, outsz;
1441                     size_t  res;
1442
1443                     buf[0] = 'A';
1444                     buf[1] = 0;
1445                     wbuf[0] = 0;
1446                     insz = 2;
1447                     outsz = SIZEOF_WCHAR_T * 2;
1448                     wbufPtr = wbuf;
1449                     bufPtr = buf;
1450
1451                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1452                                 (char**)&wbufPtr, &outsz);
1453
1454                     if (ICONV_FAILED(res, insz))
1455                     {
1456                         wxLogLastError(wxT("iconv"));
1457                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1458                                    nameCS.c_str());
1459                     }
1460                     else // ok, can convert to this encoding, remember it
1461                     {
1462                         ms_wcCharsetName = nameCS;
1463                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1464                     }
1465                 }
1466             }
1467             else // use charset not requiring byte swapping
1468             {
1469                 ms_wcCharsetName = nameXE;
1470             }
1471         }
1472
1473         wxLogTrace(TRACE_STRCONV,
1474                    wxT("iconv wchar_t charset is \"%s\"%s"),
1475                    ms_wcCharsetName.empty() ? _T("<none>")
1476                                             : ms_wcCharsetName.c_str(),
1477                    ms_wcNeedsSwap ? _T(" (needs swap)")
1478                                   : _T(""));
1479     }
1480     else // we already have ms_wcCharsetName
1481     {
1482         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1483     }
1484
1485     if ( ms_wcCharsetName.empty() )
1486     {
1487         w2m = ICONV_T_INVALID;
1488     }
1489     else
1490     {
1491         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1492         if ( w2m == ICONV_T_INVALID )
1493         {
1494             wxLogTrace(TRACE_STRCONV,
1495                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1496                        ms_wcCharsetName.c_str(), cname.data());
1497         }
1498     }
1499 }
1500
1501 wxMBConv_iconv::~wxMBConv_iconv()
1502 {
1503     if ( m2w != ICONV_T_INVALID )
1504         iconv_close(m2w);
1505     if ( w2m != ICONV_T_INVALID )
1506         iconv_close(w2m);
1507 }
1508
1509 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1510 {
1511 #if wxUSE_THREADS
1512     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1513     //     Unfortunately there is a couple of global wxCSConv objects such as
1514     //     wxConvLocal that are used all over wx code, so we have to make sure
1515     //     the handle is used by at most one thread at the time. Otherwise
1516     //     only a few wx classes would be safe to use from non-main threads
1517     //     as MB<->WC conversion would fail "randomly".
1518     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1519 #endif
1520
1521     size_t inbuf = strlen(psz);
1522     size_t outbuf = n * SIZEOF_WCHAR_T;
1523     size_t res, cres;
1524     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1525     wchar_t *bufPtr = buf;
1526     const char *pszPtr = psz;
1527
1528     if (buf)
1529     {
1530         // have destination buffer, convert there
1531         cres = iconv(m2w,
1532                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1533                      (char**)&bufPtr, &outbuf);
1534         res = n - (outbuf / SIZEOF_WCHAR_T);
1535
1536         if (ms_wcNeedsSwap)
1537         {
1538             // convert to native endianness
1539             for ( unsigned i = 0; i < res; i++ )
1540                 buf[n] = WC_BSWAP(buf[i]);
1541         }
1542
1543         // NB: iconv was given only strlen(psz) characters on input, and so
1544         //     it couldn't convert the trailing zero. Let's do it ourselves
1545         //     if there's some room left for it in the output buffer.
1546         if (res < n)
1547             buf[res] = 0;
1548     }
1549     else
1550     {
1551         // no destination buffer... convert using temp buffer
1552         // to calculate destination buffer requirement
1553         wchar_t tbuf[8];
1554         res = 0;
1555         do {
1556             bufPtr = tbuf;
1557             outbuf = 8*SIZEOF_WCHAR_T;
1558
1559             cres = iconv(m2w,
1560                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1561                          (char**)&bufPtr, &outbuf );
1562
1563             res += 8-(outbuf/SIZEOF_WCHAR_T);
1564         } while ((cres==(size_t)-1) && (errno==E2BIG));
1565     }
1566
1567     if (ICONV_FAILED(cres, inbuf))
1568     {
1569         //VS: it is ok if iconv fails, hence trace only
1570         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1571         return (size_t)-1;
1572     }
1573
1574     return res;
1575 }
1576
1577 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1578 {
1579 #if wxUSE_THREADS
1580     // NB: explained in MB2WC
1581     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1582 #endif
1583
1584     size_t inlen = wxWcslen(psz);
1585     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1586     size_t outbuf = n;
1587     size_t res, cres;
1588
1589     wchar_t *tmpbuf = 0;
1590
1591     if (ms_wcNeedsSwap)
1592     {
1593         // need to copy to temp buffer to switch endianness
1594         // (doing WC_BSWAP twice on the original buffer won't help, as it
1595         //  could be in read-only memory, or be accessed in some other thread)
1596         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1597         for ( size_t i = 0; i < inlen; i++ )
1598             tmpbuf[n] = WC_BSWAP(psz[i]);
1599         tmpbuf[inlen] = L'\0';
1600         psz = tmpbuf;
1601     }
1602
1603     if (buf)
1604     {
1605         // have destination buffer, convert there
1606         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1607
1608         res = n-outbuf;
1609
1610         // NB: iconv was given only wcslen(psz) characters on input, and so
1611         //     it couldn't convert the trailing zero. Let's do it ourselves
1612         //     if there's some room left for it in the output buffer.
1613         if (res < n)
1614             buf[0] = 0;
1615     }
1616     else
1617     {
1618         // no destination buffer... convert using temp buffer
1619         // to calculate destination buffer requirement
1620         char tbuf[16];
1621         res = 0;
1622         do {
1623             buf = tbuf; outbuf = 16;
1624
1625             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1626
1627             res += 16 - outbuf;
1628         } while ((cres==(size_t)-1) && (errno==E2BIG));
1629     }
1630
1631     if (ms_wcNeedsSwap)
1632     {
1633         free(tmpbuf);
1634     }
1635
1636     if (ICONV_FAILED(cres, inbuf))
1637     {
1638         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1639         return (size_t)-1;
1640     }
1641
1642     return res;
1643 }
1644
1645 const char *wxMBConv_iconv::GetMBNul(size_t *nulLen) const
1646 {
1647     if ( m_nulLen == (size_t)-2 )
1648     {
1649         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1650
1651 #if wxUSE_THREADS
1652         // NB: explained in MB2WC
1653         wxMutexLocker lock(self->m_iconvMutex);
1654 #endif
1655
1656         wchar_t *wnul = L"";
1657         size_t inLen = sizeof(wchar_t),
1658                outLen = WXSIZEOF(m_nulBuf);
1659         const char *in = (char *)wnul;
1660         char *out = self->m_nulBuf;
1661         if ( iconv(w2m, &in, &inLen, &out, &outLen) == (size_t)-1 )
1662         {
1663             self->m_nulLen = (size_t)-1;
1664         }
1665         else // ok
1666         {
1667             self->m_nulLen = out - m_nulBuf;
1668         }
1669     }
1670
1671     *nulLen = m_nulLen;
1672     return m_nulBuf;
1673 }
1674
1675 #endif // HAVE_ICONV
1676
1677
1678 // ============================================================================
1679 // Win32 conversion classes
1680 // ============================================================================
1681
1682 #ifdef wxHAVE_WIN32_MB2WC
1683
1684 // from utils.cpp
1685 #if wxUSE_FONTMAP
1686 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1687 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1688 #endif
1689
1690 class wxMBConv_win32 : public wxMBConv
1691 {
1692 public:
1693     wxMBConv_win32()
1694     {
1695         m_CodePage = CP_ACP;
1696         m_nulLen = (size_t)-2;
1697     }
1698
1699 #if wxUSE_FONTMAP
1700     wxMBConv_win32(const wxChar* name)
1701     {
1702         m_CodePage = wxCharsetToCodepage(name);
1703         m_nulLen = (size_t)-2;
1704     }
1705
1706     wxMBConv_win32(wxFontEncoding encoding)
1707     {
1708         m_CodePage = wxEncodingToCodepage(encoding);
1709         m_nulLen = (size_t)-2;
1710     }
1711 #endif // wxUSE_FONTMAP
1712
1713     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1714     {
1715         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1716         // the behaviour is not compatible with the Unix version (using iconv)
1717         // and break the library itself, e.g. wxTextInputStream::NextChar()
1718         // wouldn't work if reading an incomplete MB char didn't result in an
1719         // error
1720         //
1721         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1722         // an error (tested under Windows Server 2003) and apparently it is
1723         // done on purpose, i.e. the function accepts any input in this case
1724         // and although I'd prefer to return error on ill-formed output, our
1725         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1726         // explicitly ill-formed according to RFC 2152) neither so we don't
1727         // even have any fallback here...
1728         //
1729         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1730         // Win XP or newer and if it is specified on older versions, conversion
1731         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1732         // fails. So we can only use the flag on newer Windows versions.
1733         // Additionally, the flag is not supported by UTF7, symbol and CJK
1734         // encodings. See here:
1735         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1736         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1737         int flags = 0;
1738         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1739              m_CodePage < 50000 &&
1740              IsAtLeastWin2kSP4() )
1741         {
1742             flags = MB_ERR_INVALID_CHARS;
1743         }
1744         else if ( m_CodePage == CP_UTF8 )
1745         {
1746             // Avoid round-trip in the special case of UTF-8 by using our
1747             // own UTF-8 conversion code:
1748             return wxMBConvUTF8().MB2WC(buf, psz, n);
1749         }
1750
1751         const size_t len = ::MultiByteToWideChar
1752                              (
1753                                 m_CodePage,     // code page
1754                                 flags,          // flags: fall on error
1755                                 psz,            // input string
1756                                 -1,             // its length (NUL-terminated)
1757                                 buf,            // output string
1758                                 buf ? n : 0     // size of output buffer
1759                              );
1760         if ( !len )
1761         {
1762             // function totally failed
1763             return (size_t)-1;
1764         }
1765
1766         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1767         // check if we succeeded, by doing a double trip:
1768         if ( !flags && buf )
1769         {
1770             const size_t mbLen = strlen(psz);
1771             wxCharBuffer mbBuf(mbLen);
1772             if ( ::WideCharToMultiByte
1773                    (
1774                       m_CodePage,
1775                       0,
1776                       buf,
1777                       -1,
1778                       mbBuf.data(),
1779                       mbLen + 1,        // size in bytes, not length
1780                       NULL,
1781                       NULL
1782                    ) == 0 ||
1783                   strcmp(mbBuf, psz) != 0 )
1784             {
1785                 // we didn't obtain the same thing we started from, hence
1786                 // the conversion was lossy and we consider that it failed
1787                 return (size_t)-1;
1788             }
1789         }
1790
1791         // note that it returns count of written chars for buf != NULL and size
1792         // of the needed buffer for buf == NULL so in either case the length of
1793         // the string (which never includes the terminating NUL) is one less
1794         return len - 1;
1795     }
1796
1797     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1798     {
1799         /*
1800             we have a problem here: by default, WideCharToMultiByte() may
1801             replace characters unrepresentable in the target code page with bad
1802             quality approximations such as turning "1/2" symbol (U+00BD) into
1803             "1" for the code pages which don't have it and we, obviously, want
1804             to avoid this at any price
1805
1806             the trouble is that this function does it _silently_, i.e. it won't
1807             even tell us whether it did or not... Win98/2000 and higher provide
1808             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1809             we have to resort to a round trip, i.e. check that converting back
1810             results in the same string -- this is, of course, expensive but
1811             otherwise we simply can't be sure to not garble the data.
1812          */
1813
1814         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1815         // it doesn't work with CJK encodings (which we test for rather roughly
1816         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1817         // supporting it
1818         BOOL usedDef wxDUMMY_INITIALIZE(false);
1819         BOOL *pUsedDef;
1820         int flags;
1821         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1822         {
1823             // it's our lucky day
1824             flags = WC_NO_BEST_FIT_CHARS;
1825             pUsedDef = &usedDef;
1826         }
1827         else // old system or unsupported encoding
1828         {
1829             flags = 0;
1830             pUsedDef = NULL;
1831         }
1832
1833         const size_t len = ::WideCharToMultiByte
1834                              (
1835                                 m_CodePage,     // code page
1836                                 flags,          // either none or no best fit
1837                                 pwz,            // input string
1838                                 -1,             // it is (wide) NUL-terminated
1839                                 buf,            // output buffer
1840                                 buf ? n : 0,    // and its size
1841                                 NULL,           // default "replacement" char
1842                                 pUsedDef        // [out] was it used?
1843                              );
1844
1845         if ( !len )
1846         {
1847             // function totally failed
1848             return (size_t)-1;
1849         }
1850
1851         // if we were really converting, check if we succeeded
1852         if ( buf )
1853         {
1854             if ( flags )
1855             {
1856                 // check if the conversion failed, i.e. if any replacements
1857                 // were done
1858                 if ( usedDef )
1859                     return (size_t)-1;
1860             }
1861             else // we must resort to double tripping...
1862             {
1863                 wxWCharBuffer wcBuf(n);
1864                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1865                         wcscmp(wcBuf, pwz) != 0 )
1866                 {
1867                     // we didn't obtain the same thing we started from, hence
1868                     // the conversion was lossy and we consider that it failed
1869                     return (size_t)-1;
1870                 }
1871             }
1872         }
1873
1874         // see the comment above for the reason of "len - 1"
1875         return len - 1;
1876     }
1877
1878     bool IsOk() const { return m_CodePage != -1; }
1879
1880 private:
1881     static bool CanUseNoBestFit()
1882     {
1883         static int s_isWin98Or2k = -1;
1884
1885         if ( s_isWin98Or2k == -1 )
1886         {
1887             int verMaj, verMin;
1888             switch ( wxGetOsVersion(&verMaj, &verMin) )
1889             {
1890                 case wxWIN95:
1891                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1892                     break;
1893
1894                 case wxWINDOWS_NT:
1895                     s_isWin98Or2k = verMaj >= 5;
1896                     break;
1897
1898                 default:
1899                     // unknown, be conseravtive by default
1900                     s_isWin98Or2k = 0;
1901             }
1902
1903             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1904         }
1905
1906         return s_isWin98Or2k == 1;
1907     }
1908
1909     static bool IsAtLeastWin2kSP4()
1910     {
1911 #ifdef __WXWINCE__
1912         return false;
1913 #else
1914         static int s_isAtLeastWin2kSP4 = -1;
1915
1916         if ( s_isAtLeastWin2kSP4 == -1 )
1917         {
1918             OSVERSIONINFOEX ver;
1919
1920             memset(&ver, 0, sizeof(ver));
1921             ver.dwOSVersionInfoSize = sizeof(ver);
1922             GetVersionEx((OSVERSIONINFO*)&ver);
1923
1924             s_isAtLeastWin2kSP4 =
1925               ((ver.dwMajorVersion > 5) || // Vista+
1926                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1927                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1928                ver.wServicePackMajor >= 4)) // 2000 SP4+
1929               ? 1 : 0;
1930         }
1931
1932         return s_isAtLeastWin2kSP4 == 1;
1933 #endif
1934     }
1935
1936     virtual const char *GetMBNul(size_t *nulLen) const
1937     {
1938         if ( m_nulLen == (size_t)-2 )
1939         {
1940             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1941
1942             self->m_nulLen = ::WideCharToMultiByte
1943                                (
1944                                     m_CodePage,         // code page
1945                                     0,                  // no flags
1946                                     L"",                // input string
1947                                     1,                  // translate just NUL
1948                                     self->m_nulBuf,     // output buffer
1949                                     WXSIZEOF(m_nulBuf), // and its size
1950                                     NULL,               // "replacement" char
1951                                     NULL                // [out] was it used?
1952                                );
1953
1954             if ( m_nulLen == 0 )
1955                 self->m_nulLen = (size_t)-1;
1956         }
1957
1958         *nulLen = m_nulLen;
1959         return m_nulBuf;
1960     }
1961
    // code page passed to the Win32 conversion functions; IsOk() treats -1
    // as "invalid"
    long m_CodePage;

    // cached length of the converted NUL: (size_t)-2 until GetMBNul()
    // computes it, (size_t)-1 if that conversion failed
    size_t m_nulLen;

    // cached bytes of the converted NUL, filled in by GetMBNul()
    char m_nulBuf[8];
1965 };
1966
1967 #endif // wxHAVE_WIN32_MB2WC
1968
1969 // ============================================================================
1970 // Cocoa conversion classes
1971 // ============================================================================
1972
1973 #if defined(__WXCOCOA__)
1974
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1978
1979 #include <CoreFoundation/CFString.h>
1980 #include <CoreFoundation/CFStringEncodingExt.h>
1981
1982 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1983 {
1984     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1985     if ( encoding == wxFONTENCODING_DEFAULT )
1986     {
1987         enc = CFStringGetSystemEncoding();
1988     }
1989     else switch( encoding)
1990     {
1991         case wxFONTENCODING_ISO8859_1 :
1992             enc = kCFStringEncodingISOLatin1 ;
1993             break ;
1994         case wxFONTENCODING_ISO8859_2 :
1995             enc = kCFStringEncodingISOLatin2;
1996             break ;
1997         case wxFONTENCODING_ISO8859_3 :
1998             enc = kCFStringEncodingISOLatin3 ;
1999             break ;
2000         case wxFONTENCODING_ISO8859_4 :
2001             enc = kCFStringEncodingISOLatin4;
2002             break ;
2003         case wxFONTENCODING_ISO8859_5 :
2004             enc = kCFStringEncodingISOLatinCyrillic;
2005             break ;
2006         case wxFONTENCODING_ISO8859_6 :
2007             enc = kCFStringEncodingISOLatinArabic;
2008             break ;
2009         case wxFONTENCODING_ISO8859_7 :
2010             enc = kCFStringEncodingISOLatinGreek;
2011             break ;
2012         case wxFONTENCODING_ISO8859_8 :
2013             enc = kCFStringEncodingISOLatinHebrew;
2014             break ;
2015         case wxFONTENCODING_ISO8859_9 :
2016             enc = kCFStringEncodingISOLatin5;
2017             break ;
2018         case wxFONTENCODING_ISO8859_10 :
2019             enc = kCFStringEncodingISOLatin6;
2020             break ;
2021         case wxFONTENCODING_ISO8859_11 :
2022             enc = kCFStringEncodingISOLatinThai;
2023             break ;
2024         case wxFONTENCODING_ISO8859_13 :
2025             enc = kCFStringEncodingISOLatin7;
2026             break ;
2027         case wxFONTENCODING_ISO8859_14 :
2028             enc = kCFStringEncodingISOLatin8;
2029             break ;
2030         case wxFONTENCODING_ISO8859_15 :
2031             enc = kCFStringEncodingISOLatin9;
2032             break ;
2033
2034         case wxFONTENCODING_KOI8 :
2035             enc = kCFStringEncodingKOI8_R;
2036             break ;
2037         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2038             enc = kCFStringEncodingDOSRussian;
2039             break ;
2040
2041 //      case wxFONTENCODING_BULGARIAN :
2042 //          enc = ;
2043 //          break ;
2044
2045         case wxFONTENCODING_CP437 :
2046             enc =kCFStringEncodingDOSLatinUS ;
2047             break ;
2048         case wxFONTENCODING_CP850 :
2049             enc = kCFStringEncodingDOSLatin1;
2050             break ;
2051         case wxFONTENCODING_CP852 :
2052             enc = kCFStringEncodingDOSLatin2;
2053             break ;
2054         case wxFONTENCODING_CP855 :
2055             enc = kCFStringEncodingDOSCyrillic;
2056             break ;
2057         case wxFONTENCODING_CP866 :
2058             enc =kCFStringEncodingDOSRussian ;
2059             break ;
2060         case wxFONTENCODING_CP874 :
2061             enc = kCFStringEncodingDOSThai;
2062             break ;
2063         case wxFONTENCODING_CP932 :
2064             enc = kCFStringEncodingDOSJapanese;
2065             break ;
2066         case wxFONTENCODING_CP936 :
2067             enc =kCFStringEncodingDOSChineseSimplif ;
2068             break ;
2069         case wxFONTENCODING_CP949 :
2070             enc = kCFStringEncodingDOSKorean;
2071             break ;
2072         case wxFONTENCODING_CP950 :
2073             enc = kCFStringEncodingDOSChineseTrad;
2074             break ;
2075         case wxFONTENCODING_CP1250 :
2076             enc = kCFStringEncodingWindowsLatin2;
2077             break ;
2078         case wxFONTENCODING_CP1251 :
2079             enc =kCFStringEncodingWindowsCyrillic ;
2080             break ;
2081         case wxFONTENCODING_CP1252 :
2082             enc =kCFStringEncodingWindowsLatin1 ;
2083             break ;
2084         case wxFONTENCODING_CP1253 :
2085             enc = kCFStringEncodingWindowsGreek;
2086             break ;
2087         case wxFONTENCODING_CP1254 :
2088             enc = kCFStringEncodingWindowsLatin5;
2089             break ;
2090         case wxFONTENCODING_CP1255 :
2091             enc =kCFStringEncodingWindowsHebrew ;
2092             break ;
2093         case wxFONTENCODING_CP1256 :
2094             enc =kCFStringEncodingWindowsArabic ;
2095             break ;
2096         case wxFONTENCODING_CP1257 :
2097             enc = kCFStringEncodingWindowsBalticRim;
2098             break ;
2099 //   This only really encodes to UTF7 (if that) evidently
2100 //        case wxFONTENCODING_UTF7 :
2101 //            enc = kCFStringEncodingNonLossyASCII ;
2102 //            break ;
2103         case wxFONTENCODING_UTF8 :
2104             enc = kCFStringEncodingUTF8 ;
2105             break ;
2106         case wxFONTENCODING_EUC_JP :
2107             enc = kCFStringEncodingEUC_JP;
2108             break ;
2109         case wxFONTENCODING_UTF16 :
2110             enc = kCFStringEncodingUnicode ;
2111             break ;
2112         case wxFONTENCODING_MACROMAN :
2113             enc = kCFStringEncodingMacRoman ;
2114             break ;
2115         case wxFONTENCODING_MACJAPANESE :
2116             enc = kCFStringEncodingMacJapanese ;
2117             break ;
2118         case wxFONTENCODING_MACCHINESETRAD :
2119             enc = kCFStringEncodingMacChineseTrad ;
2120             break ;
2121         case wxFONTENCODING_MACKOREAN :
2122             enc = kCFStringEncodingMacKorean ;
2123             break ;
2124         case wxFONTENCODING_MACARABIC :
2125             enc = kCFStringEncodingMacArabic ;
2126             break ;
2127         case wxFONTENCODING_MACHEBREW :
2128             enc = kCFStringEncodingMacHebrew ;
2129             break ;
2130         case wxFONTENCODING_MACGREEK :
2131             enc = kCFStringEncodingMacGreek ;
2132             break ;
2133         case wxFONTENCODING_MACCYRILLIC :
2134             enc = kCFStringEncodingMacCyrillic ;
2135             break ;
2136         case wxFONTENCODING_MACDEVANAGARI :
2137             enc = kCFStringEncodingMacDevanagari ;
2138             break ;
2139         case wxFONTENCODING_MACGURMUKHI :
2140             enc = kCFStringEncodingMacGurmukhi ;
2141             break ;
2142         case wxFONTENCODING_MACGUJARATI :
2143             enc = kCFStringEncodingMacGujarati ;
2144             break ;
2145         case wxFONTENCODING_MACORIYA :
2146             enc = kCFStringEncodingMacOriya ;
2147             break ;
2148         case wxFONTENCODING_MACBENGALI :
2149             enc = kCFStringEncodingMacBengali ;
2150             break ;
2151         case wxFONTENCODING_MACTAMIL :
2152             enc = kCFStringEncodingMacTamil ;
2153             break ;
2154         case wxFONTENCODING_MACTELUGU :
2155             enc = kCFStringEncodingMacTelugu ;
2156             break ;
2157         case wxFONTENCODING_MACKANNADA :
2158             enc = kCFStringEncodingMacKannada ;
2159             break ;
2160         case wxFONTENCODING_MACMALAJALAM :
2161             enc = kCFStringEncodingMacMalayalam ;
2162             break ;
2163         case wxFONTENCODING_MACSINHALESE :
2164             enc = kCFStringEncodingMacSinhalese ;
2165             break ;
2166         case wxFONTENCODING_MACBURMESE :
2167             enc = kCFStringEncodingMacBurmese ;
2168             break ;
2169         case wxFONTENCODING_MACKHMER :
2170             enc = kCFStringEncodingMacKhmer ;
2171             break ;
2172         case wxFONTENCODING_MACTHAI :
2173             enc = kCFStringEncodingMacThai ;
2174             break ;
2175         case wxFONTENCODING_MACLAOTIAN :
2176             enc = kCFStringEncodingMacLaotian ;
2177             break ;
2178         case wxFONTENCODING_MACGEORGIAN :
2179             enc = kCFStringEncodingMacGeorgian ;
2180             break ;
2181         case wxFONTENCODING_MACARMENIAN :
2182             enc = kCFStringEncodingMacArmenian ;
2183             break ;
2184         case wxFONTENCODING_MACCHINESESIMP :
2185             enc = kCFStringEncodingMacChineseSimp ;
2186             break ;
2187         case wxFONTENCODING_MACTIBETAN :
2188             enc = kCFStringEncodingMacTibetan ;
2189             break ;
2190         case wxFONTENCODING_MACMONGOLIAN :
2191             enc = kCFStringEncodingMacMongolian ;
2192             break ;
2193         case wxFONTENCODING_MACETHIOPIC :
2194             enc = kCFStringEncodingMacEthiopic ;
2195             break ;
2196         case wxFONTENCODING_MACCENTRALEUR :
2197             enc = kCFStringEncodingMacCentralEurRoman ;
2198             break ;
2199         case wxFONTENCODING_MACVIATNAMESE :
2200             enc = kCFStringEncodingMacVietnamese ;
2201             break ;
2202         case wxFONTENCODING_MACARABICEXT :
2203             enc = kCFStringEncodingMacExtArabic ;
2204             break ;
2205         case wxFONTENCODING_MACSYMBOL :
2206             enc = kCFStringEncodingMacSymbol ;
2207             break ;
2208         case wxFONTENCODING_MACDINGBATS :
2209             enc = kCFStringEncodingMacDingbats ;
2210             break ;
2211         case wxFONTENCODING_MACTURKISH :
2212             enc = kCFStringEncodingMacTurkish ;
2213             break ;
2214         case wxFONTENCODING_MACCROATIAN :
2215             enc = kCFStringEncodingMacCroatian ;
2216             break ;
2217         case wxFONTENCODING_MACICELANDIC :
2218             enc = kCFStringEncodingMacIcelandic ;
2219             break ;
2220         case wxFONTENCODING_MACROMANIAN :
2221             enc = kCFStringEncodingMacRomanian ;
2222             break ;
2223         case wxFONTENCODING_MACCELTIC :
2224             enc = kCFStringEncodingMacCeltic ;
2225             break ;
2226         case wxFONTENCODING_MACGAELIC :
2227             enc = kCFStringEncodingMacGaelic ;
2228             break ;
2229 //      case wxFONTENCODING_MACKEYBOARD :
2230 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2231 //          break ;
2232         default :
2233             // because gcc is picky
2234             break ;
2235     } ;
2236     return enc ;
2237 }
2238
2239 class wxMBConv_cocoa : public wxMBConv
2240 {
2241 public:
2242     wxMBConv_cocoa()
2243     {
2244         Init(CFStringGetSystemEncoding()) ;
2245     }
2246
2247 #if wxUSE_FONTMAP
2248     wxMBConv_cocoa(const wxChar* name)
2249     {
2250         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2251     }
2252 #endif
2253
2254     wxMBConv_cocoa(wxFontEncoding encoding)
2255     {
2256         Init( wxCFStringEncFromFontEnc(encoding) );
2257     }
2258
2259     ~wxMBConv_cocoa()
2260     {
2261     }
2262
2263     void Init( CFStringEncoding encoding)
2264     {
2265         m_encoding = encoding ;
2266     }
2267
2268     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2269     {
2270         wxASSERT(szUnConv);
2271
2272         CFStringRef theString = CFStringCreateWithBytes (
2273                                                 NULL, //the allocator
2274                                                 (const UInt8*)szUnConv,
2275                                                 strlen(szUnConv),
2276                                                 m_encoding,
2277                                                 false //no BOM/external representation
2278                                                 );
2279
2280         wxASSERT(theString);
2281
2282         size_t nOutLength = CFStringGetLength(theString);
2283
2284         if (szOut == NULL)
2285         {
2286             CFRelease(theString);
2287             return nOutLength;
2288         }
2289
2290         CFRange theRange = { 0, nOutSize };
2291
2292 #if SIZEOF_WCHAR_T == 4
2293         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2294 #endif
2295
2296         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2297
2298         CFRelease(theString);
2299
2300         szUniCharBuffer[nOutLength] = '\0' ;
2301
2302 #if SIZEOF_WCHAR_T == 4
2303         wxMBConvUTF16 converter ;
2304         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2305         delete[] szUniCharBuffer;
2306 #endif
2307
2308         return nOutLength;
2309     }
2310
2311     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2312     {
2313         wxASSERT(szUnConv);
2314
2315         size_t nRealOutSize;
2316         size_t nBufSize = wxWcslen(szUnConv);
2317         UniChar* szUniBuffer = (UniChar*) szUnConv;
2318
2319 #if SIZEOF_WCHAR_T == 4
2320         wxMBConvUTF16 converter ;
2321         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2322         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2323         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2324         nBufSize /= sizeof(UniChar);
2325 #endif
2326
2327         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2328                                 NULL, //allocator
2329                                 szUniBuffer,
2330                                 nBufSize,
2331                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2332                             );
2333
2334         wxASSERT(theString);
2335
2336         //Note that CER puts a BOM when converting to unicode
2337         //so we  check and use getchars instead in that case
2338         if (m_encoding == kCFStringEncodingUnicode)
2339         {
2340             if (szOut != NULL)
2341                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2342
2343             nRealOutSize = CFStringGetLength(theString) + 1;
2344         }
2345         else
2346         {
2347             CFStringGetBytes(
2348                 theString,
2349                 CFRangeMake(0, CFStringGetLength(theString)),
2350                 m_encoding,
2351                 0, //what to put in characters that can't be converted -
2352                     //0 tells CFString to return NULL if it meets such a character
2353                 false, //not an external representation
2354                 (UInt8*) szOut,
2355                 nOutSize,
2356                 (CFIndex*) &nRealOutSize
2357                         );
2358         }
2359
2360         CFRelease(theString);
2361
2362 #if SIZEOF_WCHAR_T == 4
2363         delete[] szUniBuffer;
2364 #endif
2365
2366         return  nRealOutSize - 1;
2367     }
2368
2369     bool IsOk() const
2370     {
2371         return m_encoding != kCFStringEncodingInvalidId &&
2372               CFStringIsEncodingAvailable(m_encoding);
2373     }
2374
2375 private:
2376     CFStringEncoding m_encoding ;
2377 };
2378
2379 #endif // defined(__WXCOCOA__)
2380
2381 // ============================================================================
2382 // Mac conversion classes
2383 // ============================================================================
2384
2385 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2386
2387 class wxMBConv_mac : public wxMBConv
2388 {
2389 public:
2390     wxMBConv_mac()
2391     {
2392         Init(CFStringGetSystemEncoding()) ;
2393     }
2394
2395 #if wxUSE_FONTMAP
2396     wxMBConv_mac(const wxChar* name)
2397     {
2398         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2399     }
2400 #endif
2401
2402     wxMBConv_mac(wxFontEncoding encoding)
2403     {
2404         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2405     }
2406
2407     ~wxMBConv_mac()
2408     {
2409         OSStatus status = noErr ;
2410         status = TECDisposeConverter(m_MB2WC_converter);
2411         status = TECDisposeConverter(m_WC2MB_converter);
2412     }
2413
2414
2415     void Init( TextEncodingBase encoding)
2416     {
2417         OSStatus status = noErr ;
2418         m_char_encoding = encoding ;
2419         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2420
2421         status = TECCreateConverter(&m_MB2WC_converter,
2422                                     m_char_encoding,
2423                                     m_unicode_encoding);
2424         status = TECCreateConverter(&m_WC2MB_converter,
2425                                     m_unicode_encoding,
2426                                     m_char_encoding);
2427     }
2428
2429     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2430     {
2431         OSStatus status = noErr ;
2432         ByteCount byteOutLen ;
2433         ByteCount byteInLen = strlen(psz) ;
2434         wchar_t *tbuf = NULL ;
2435         UniChar* ubuf = NULL ;
2436         size_t res = 0 ;
2437
2438         if (buf == NULL)
2439         {
2440             //apple specs say at least 32
2441             n = wxMax( 32 , byteInLen ) ;
2442             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2443         }
2444         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2445 #if SIZEOF_WCHAR_T == 4
2446         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2447 #else
2448         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2449 #endif
2450         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2451           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2452 #if SIZEOF_WCHAR_T == 4
2453         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2454         // is not properly terminated we get random characters at the end
2455         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2456         wxMBConvUTF16 converter ;
2457         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2458         free( ubuf ) ;
2459 #else
2460         res = byteOutLen / sizeof( UniChar ) ;
2461 #endif
2462         if ( buf == NULL )
2463              free(tbuf) ;
2464
2465         if ( buf  && res < n)
2466             buf[res] = 0;
2467
2468         return res ;
2469     }
2470
2471     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2472     {
2473         OSStatus status = noErr ;
2474         ByteCount byteOutLen ;
2475         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2476
2477         char *tbuf = NULL ;
2478
2479         if (buf == NULL)
2480         {
2481             //apple specs say at least 32
2482             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2483             tbuf = (char*) malloc( n ) ;
2484         }
2485
2486         ByteCount byteBufferLen = n ;
2487         UniChar* ubuf = NULL ;
2488 #if SIZEOF_WCHAR_T == 4
2489         wxMBConvUTF16 converter ;
2490         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2491         byteInLen = unicharlen ;
2492         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2493         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2494 #else
2495         ubuf = (UniChar*) psz ;
2496 #endif
2497         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2498             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2499 #if SIZEOF_WCHAR_T == 4
2500         free( ubuf ) ;
2501 #endif
2502         if ( buf == NULL )
2503             free(tbuf) ;
2504
2505         size_t res = byteOutLen ;
2506         if ( buf  && res < n)
2507         {
2508             buf[res] = 0;
2509
2510             //we need to double-trip to verify it didn't insert any ? in place
2511             //of bogus characters
2512             wxWCharBuffer wcBuf(n);
2513             size_t pszlen = wxWcslen(psz);
2514             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2515                         wxWcslen(wcBuf) != pszlen ||
2516                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2517             {
2518                 // we didn't obtain the same thing we started from, hence
2519                 // the conversion was lossy and we consider that it failed
2520                 return (size_t)-1;
2521             }
2522         }
2523
2524         return res ;
2525     }
2526
2527     bool IsOk() const
2528         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2529
2530 private:
2531     TECObjectRef m_MB2WC_converter ;
2532     TECObjectRef m_WC2MB_converter ;
2533
2534     TextEncodingBase m_char_encoding ;
2535     TextEncodingBase m_unicode_encoding ;
2536 };
2537
2538 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2539
2540 // ============================================================================
2541 // wxEncodingConverter based conversion classes
2542 // ============================================================================
2543
2544 #if wxUSE_FONTMAP
2545
2546 class wxMBConv_wxwin : public wxMBConv
2547 {
2548 private:
2549     void Init()
2550     {
2551         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2552                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2553     }
2554
2555 public:
2556     // temporarily just use wxEncodingConverter stuff,
2557     // so that it works while a better implementation is built
2558     wxMBConv_wxwin(const wxChar* name)
2559     {
2560         if (name)
2561             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2562         else
2563             m_enc = wxFONTENCODING_SYSTEM;
2564
2565         Init();
2566     }
2567
2568     wxMBConv_wxwin(wxFontEncoding enc)
2569     {
2570         m_enc = enc;
2571
2572         Init();
2573     }
2574
2575     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2576     {
2577         size_t inbuf = strlen(psz);
2578         if (buf)
2579         {
2580             if (!m2w.Convert(psz,buf))
2581                 return (size_t)-1;
2582         }
2583         return inbuf;
2584     }
2585
2586     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2587     {
2588         const size_t inbuf = wxWcslen(psz);
2589         if (buf)
2590         {
2591             if (!w2m.Convert(psz,buf))
2592                 return (size_t)-1;
2593         }
2594
2595         return inbuf;
2596     }
2597
2598     bool IsOk() const { return m_ok; }
2599
2600 public:
2601     wxFontEncoding m_enc;
2602     wxEncodingConverter m2w, w2m;
2603
2604 private:
2605     virtual const char *GetMBNul(size_t *nulLen) const
2606     {
2607         switch ( m_enc )
2608         {
2609             case wxFONTENCODING_UTF16BE:
2610             case wxFONTENCODING_UTF16LE:
2611                 *nulLen = 2;
2612                 return "\0";
2613
2614             case wxFONTENCODING_UTF32BE:
2615             case wxFONTENCODING_UTF32LE:
2616                 *nulLen = 4;
2617                 return "\0\0\0";
2618
2619             default:
2620                 *nulLen = 1;
2621                 return "";
2622         }
2623     }
2624
2625     // were we initialized successfully?
2626     bool m_ok;
2627
2628     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2629 };
2630
2631 // make the constructors available for unit testing
2632 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2633 {
2634     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2635     if ( !result->IsOk() )
2636     {
2637         delete result;
2638         return 0;
2639     }
2640     return result;
2641 }
2642
2643 #endif // wxUSE_FONTMAP
2644
2645 // ============================================================================
2646 // wxCSConv implementation
2647 // ============================================================================
2648
2649 void wxCSConv::Init()
2650 {
2651     m_name = NULL;
2652     m_convReal =  NULL;
2653     m_deferred = true;
2654 }
2655
2656 wxCSConv::wxCSConv(const wxChar *charset)
2657 {
2658     Init();
2659
2660     if ( charset )
2661     {
2662         SetName(charset);
2663     }
2664
2665 #if wxUSE_FONTMAP
2666     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2667 #else
2668     m_encoding = wxFONTENCODING_SYSTEM;
2669 #endif
2670 }
2671
2672 wxCSConv::wxCSConv(wxFontEncoding encoding)
2673 {
2674     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2675     {
2676         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2677
2678         encoding = wxFONTENCODING_SYSTEM;
2679     }
2680
2681     Init();
2682
2683     m_encoding = encoding;
2684 }
2685
2686 wxCSConv::~wxCSConv()
2687 {
2688     Clear();
2689 }
2690
2691 wxCSConv::wxCSConv(const wxCSConv& conv)
2692         : wxMBConv()
2693 {
2694     Init();
2695
2696     SetName(conv.m_name);
2697     m_encoding = conv.m_encoding;
2698 }
2699
2700 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2701 {
2702     Clear();
2703
2704     SetName(conv.m_name);
2705     m_encoding = conv.m_encoding;
2706
2707     return *this;
2708 }
2709
2710 void wxCSConv::Clear()
2711 {
2712     free(m_name);
2713     delete m_convReal;
2714
2715     m_name = NULL;
2716     m_convReal = NULL;
2717 }
2718
2719 void wxCSConv::SetName(const wxChar *charset)
2720 {
2721     if (charset)
2722     {
2723         m_name = wxStrdup(charset);
2724         m_deferred = true;
2725     }
2726 }
2727
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map type associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding already tried it
// holds the charset name for which an iconv-based converter could be
// created, or an empty string if none of the names worked
static wxEncodingNameCache gs_nameCache;
#endif
2736
2737 wxMBConv *wxCSConv::DoCreate() const
2738 {
2739 #if wxUSE_FONTMAP
2740     wxLogTrace(TRACE_STRCONV,
2741                wxT("creating conversion for %s"),
2742                (m_name ? m_name
2743                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2744 #endif // wxUSE_FONTMAP
2745
2746     // check for the special case of ASCII or ISO8859-1 charset: as we have
2747     // special knowledge of it anyhow, we don't need to create a special
2748     // conversion object
2749     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2750             m_encoding == wxFONTENCODING_DEFAULT )
2751     {
2752         // don't convert at all
2753         return NULL;
2754     }
2755
2756     // we trust OS to do conversion better than we can so try external
2757     // conversion methods first
2758     //
2759     // the full order is:
2760     //      1. OS conversion (iconv() under Unix or Win32 API)
2761     //      2. hard coded conversions for UTF
2762     //      3. wxEncodingConverter as fall back
2763
2764     // step (1)
2765 #ifdef HAVE_ICONV
2766 #if !wxUSE_FONTMAP
2767     if ( m_name )
2768 #endif // !wxUSE_FONTMAP
2769     {
2770         wxString name(m_name);
2771         wxFontEncoding encoding(m_encoding);
2772
2773         if ( !name.empty() )
2774         {
2775             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2776             if ( conv->IsOk() )
2777                 return conv;
2778
2779             delete conv;
2780
2781 #if wxUSE_FONTMAP
2782             encoding =
2783                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2784 #endif // wxUSE_FONTMAP
2785         }
2786 #if wxUSE_FONTMAP
2787         {
2788             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2789             if ( it != gs_nameCache.end() )
2790             {
2791                 if ( it->second.empty() )
2792                     return NULL;
2793
2794                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2795                 if ( conv->IsOk() )
2796                     return conv;
2797
2798                 delete conv;
2799             }
2800
2801             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2802
2803             for ( ; *names; ++names )
2804             {
2805                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2806                 if ( conv->IsOk() )
2807                 {
2808                     gs_nameCache[encoding] = *names;
2809                     return conv;
2810                 }
2811
2812                 delete conv;
2813             }
2814
2815             gs_nameCache[encoding] = _T(""); // cache the failure
2816         }
2817 #endif // wxUSE_FONTMAP
2818     }
2819 #endif // HAVE_ICONV
2820
2821 #ifdef wxHAVE_WIN32_MB2WC
2822     {
2823 #if wxUSE_FONTMAP
2824         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2825                                       : new wxMBConv_win32(m_encoding);
2826         if ( conv->IsOk() )
2827             return conv;
2828
2829         delete conv;
2830 #else
2831         return NULL;
2832 #endif
2833     }
2834 #endif // wxHAVE_WIN32_MB2WC
2835 #if defined(__WXMAC__)
2836     {
2837         // leave UTF16 and UTF32 to the built-ins of wx
2838         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2839             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2840         {
2841
2842 #if wxUSE_FONTMAP
2843             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2844                                         : new wxMBConv_mac(m_encoding);
2845 #else
2846             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2847 #endif
2848             if ( conv->IsOk() )
2849                  return conv;
2850
2851             delete conv;
2852         }
2853     }
2854 #endif
2855 #if defined(__WXCOCOA__)
2856     {
2857         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2858         {
2859
2860 #if wxUSE_FONTMAP
2861             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2862                                           : new wxMBConv_cocoa(m_encoding);
2863 #else
2864             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2865 #endif
2866             if ( conv->IsOk() )
2867                  return conv;
2868
2869             delete conv;
2870         }
2871     }
2872 #endif
2873     // step (2)
2874     wxFontEncoding enc = m_encoding;
2875 #if wxUSE_FONTMAP
2876     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2877     {
2878         // use "false" to suppress interactive dialogs -- we can be called from
2879         // anywhere and popping up a dialog from here is the last thing we want to
2880         // do
2881         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2882     }
2883 #endif // wxUSE_FONTMAP
2884
2885     switch ( enc )
2886     {
2887         case wxFONTENCODING_UTF7:
2888              return new wxMBConvUTF7;
2889
2890         case wxFONTENCODING_UTF8:
2891              return new wxMBConvUTF8;
2892
2893         case wxFONTENCODING_UTF16BE:
2894              return new wxMBConvUTF16BE;
2895
2896         case wxFONTENCODING_UTF16LE:
2897              return new wxMBConvUTF16LE;
2898
2899         case wxFONTENCODING_UTF32BE:
2900              return new wxMBConvUTF32BE;
2901
2902         case wxFONTENCODING_UTF32LE:
2903              return new wxMBConvUTF32LE;
2904
2905         default:
2906              // nothing to do but put here to suppress gcc warnings
2907              ;
2908     }
2909
2910     // step (3)
2911 #if wxUSE_FONTMAP
2912     {
2913         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2914                                       : new wxMBConv_wxwin(m_encoding);
2915         if ( conv->IsOk() )
2916             return conv;
2917
2918         delete conv;
2919     }
2920 #endif // wxUSE_FONTMAP
2921
2922     // NB: This is a hack to prevent deadlock. What could otherwise happen
2923     //     in Unicode build: wxConvLocal creation ends up being here
2924     //     because of some failure and logs the error. But wxLog will try to
2925     //     attach timestamp, for which it will need wxConvLocal (to convert
2926     //     time to char* and then wchar_t*), but that fails, tries to log
2927     //     error, but wxLog has a (already locked) critical section that
2928     //     guards static buffer.
2929     static bool alreadyLoggingError = false;
2930     if (!alreadyLoggingError)
2931     {
2932         alreadyLoggingError = true;
2933         wxLogError(_("Cannot convert from the charset '%s'!"),
2934                    m_name ? m_name
2935                       :
2936 #if wxUSE_FONTMAP
2937                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2938 #else // !wxUSE_FONTMAP
2939                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2940 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2941               );
2942         alreadyLoggingError = false;
2943     }
2944
2945     return NULL;
2946 }
2947
2948 void wxCSConv::CreateConvIfNeeded() const
2949 {
2950     if ( m_deferred )
2951     {
2952         wxCSConv *self = (wxCSConv *)this; // const_cast
2953
2954 #if wxUSE_INTL
2955         // if we don't have neither the name nor the encoding, use the default
2956         // encoding for this system
2957         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2958         {
2959             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2960         }
2961 #endif // wxUSE_INTL
2962
2963         self->m_convReal = DoCreate();
2964         self->m_deferred = false;
2965     }
2966 }
2967
2968 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2969 {
2970     CreateConvIfNeeded();
2971
2972     if (m_convReal)
2973         return m_convReal->MB2WC(buf, psz, n);
2974
2975     // latin-1 (direct)
2976     size_t len = strlen(psz);
2977
2978     if (buf)
2979     {
2980         for (size_t c = 0; c <= len; c++)
2981             buf[c] = (unsigned char)(psz[c]);
2982     }
2983
2984     return len;
2985 }
2986
2987 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2988 {
2989     CreateConvIfNeeded();
2990
2991     if (m_convReal)
2992         return m_convReal->WC2MB(buf, psz, n);
2993
2994     // latin-1 (direct)
2995     const size_t len = wxWcslen(psz);
2996     if (buf)
2997     {
2998         for (size_t c = 0; c <= len; c++)
2999         {
3000             if (psz[c] > 0xFF)
3001                 return (size_t)-1;
3002             buf[c] = (char)psz[c];
3003         }
3004     }
3005     else
3006     {
3007         for (size_t c = 0; c <= len; c++)
3008         {
3009             if (psz[c] > 0xFF)
3010                 return (size_t)-1;
3011         }
3012     }
3013
3014     return len;
3015 }
3016
3017 const char *wxCSConv::GetMBNul(size_t *nulLen) const
3018 {
3019     CreateConvIfNeeded();
3020
3021     if ( m_convReal )
3022     {
3023         // cast needed just to call private function of m_convReal
3024         return ((wxCSConv *)m_convReal)->GetMBNul(nulLen);
3025     }
3026
3027     *nulLen = 1;
3028     return "";
3029 }
3030
3031 // ----------------------------------------------------------------------------
3032 // globals
3033 // ----------------------------------------------------------------------------
3034
3035 #ifdef __WINDOWS__
3036     static wxMBConv_win32 wxConvLibcObj;
3037 #elif defined(__WXMAC__) && !defined(__MACH__)
3038     static wxMBConv_mac wxConvLibcObj ;
3039 #else
3040     static wxMBConvLibc wxConvLibcObj;
3041 #endif
3042
3043 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3044 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3045 static wxMBConvUTF7 wxConvUTF7Obj;
3046 static wxMBConvUTF8 wxConvUTF8Obj;
3047
3048 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3049 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3050 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3051 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3052 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3053 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3054 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3055 #ifdef __WXOSX__
3056                                     wxConvUTF8Obj;
3057 #else
3058                                     wxConvLibcObj;
3059 #endif
3060
3061
3062 #else // !wxUSE_WCHAR_T
3063
3064 // stand-ins in absence of wchar_t
3065 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3066                                 wxConvISO8859_1,
3067                                 wxConvLocal,
3068                                 wxConvUTF8;
3069
3070 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T