src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 wxMBConv::~wxMBConv()
 151 {
 152     // nothing to do here (necessary for Darwin linking probably)
 153 }
 154
 155 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 156 {
 157     if ( psz )
 158     {
 159         // calculate the length of the buffer needed first
 160         size_t nLen = MB2WC(NULL, psz, 0);
 161         if ( nLen != (size_t)-1 )
 162         {
 163             // now do the actual conversion
 164             wxWCharBuffer buf(nLen);
 165             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 166             if ( nLen != (size_t)-1 )
 167             {
 168                 return buf;
 169             }
 170         }
 171     }
 172
 173     wxWCharBuffer buf((wchar_t *)NULL);
 174
 175     return buf;
 176 }
 177
 178 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 179 {
 180     if ( pwz )
 181     {
 182         size_t nLen = WC2MB(NULL, pwz, 0);
 183         if ( nLen != (size_t)-1 )
 184         {
 185             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 186             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 187             if ( nLen != (size_t)-1 )
 188             {
 189                 return buf;
 190             }
 191         }
 192     }
 193
 194     wxCharBuffer buf((char *)NULL);
 195
 196     return buf;
 197 }
 198
 199 const wxWCharBuffer
 200 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 201 {
 202     // the currently accumulated wide characters
 203     wxWCharBuffer wbuf;
 204
 205     // the current length of wbuf
 206     size_t lenBuf = 0;
 207
 208     // the number of NULs terminating this string
 209     size_t nulLen   wxDUMMY_INITIALIZE(0);
 210
 211     // make a copy of the input string unless it is already properly
 212     // NUL-terminated
 213     wxCharBuffer bufTmp;
 214
 215     // if we were not given the input size we just have to assume that the
 216     // string is properly terminated as we have no way of knowing how long it
 217     // is anyhow, but if we do have the size check whether there are enough
 218     // NULs at the end
 219     if ( inLen != (size_t)-1 )
 220     {
 221         // we need to know how to find the end of this string
 222         nulLen = GetMBNulLen();
 223         if ( nulLen == (size_t)-1 )
 224             return wbuf;
 225
 226         // if there are enough NULs we can avoid the copy
 227         if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
 228         {
 229             // make a copy in order to properly NUL-terminate the string
 230             bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
 231             char * const p = bufTmp.data();
 232             memcpy(p, in, inLen);
 233             for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
 234                 *s = '\0';
 235         }
 236     }
 237
 238     if ( bufTmp )
 239         in = bufTmp;
 240
 241     size_t lenChunk;
 242     for ( const char * const inEnd = in + inLen;; )
 243     {
 244         // try to convert the current chunk
 245         lenChunk = MB2WC(NULL, in, 0);
 246         if ( lenChunk == 0 )
 247         {
 248             // nothing left in the input string, conversion succeeded
 249             break;
 250         }
 251
 252         if ( lenChunk == (size_t)-1 )
 253             break;
 254
 255         // if we already have a previous chunk, leave the NUL separating it
 256         // from this one
 257         if ( lenBuf )
 258             lenBuf++;
 259
 260         const size_t lenBufNew = lenBuf + lenChunk;
 261         if ( !wbuf.extend(lenBufNew) )
 262         {
 263             lenChunk = (size_t)-1;
 264             break;
 265         }
 266
 267         lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 268         if ( lenChunk == (size_t)-1 )
 269             break;
 270
 271         lenBuf = lenBufNew;
 272
 273         if ( inLen == (size_t)-1 )
 274         {
 275             // convert only one chunk in this case, as we suppose that the
 276             // string is NUL-terminated and so inEnd is not used at all
 277             break;
 278         }
 279
 280         // advance the input pointer past the end of this chunk
 281         while ( NotAllNULs(in, nulLen) )
 282         {
 283             // notice that we must skip over multiple bytes here as we suppose
 284             // that if NUL takes 2 or 4 bytes, then all the other characters do
 285             // too and so if advanced by a single byte we might erroneously
 286             // detect sequences of NUL bytes in the middle of the input
 287             in += nulLen;
 288         }
 289
 290         in += nulLen; // skipping over its terminator as well
 291
 292         // note that ">=" (and not just "==") is needed here as the terminator
 293         // we skipped just above could be inside or just after the buffer
 294         // delimited by inEnd
 295         if ( in >= inEnd )
 296             break;
 297     }
 298
 299     if ( lenChunk == (size_t)-1 )
 300     {
 301         // conversion failed
 302         lenBuf = 0;
 303         wbuf.reset();
 304     }
 305
 306     if ( outLen )
 307         *outLen = lenBuf;
 308
 309     return wbuf;
 310 }
 311
 312 const wxCharBuffer
 313 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 314 {
 315     // the currently accumulated multibyte characters
 316     wxCharBuffer buf;
 317
 318     // the current length of buf
 319     size_t lenBuf = 0;
 320
 321     // make a copy of the input string unless it is already properly
 322     // NUL-terminated
 323     //
 324     // if we don't know its length we have no choice but to assume that it is,
 325     // indeed, properly terminated
 326     wxWCharBuffer bufTmp;
 327     if ( inLen == (size_t)-1 )
 328     {
 329         inLen = wxWcslen(in) + 1;
 330     }
 331     else if ( inLen != 0 && in[inLen - 1] != L'\0' )
 332     {
 333         // make a copy in order to properly NUL-terminate the string
 334         bufTmp = wxWCharBuffer(inLen);
 335         memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
 336     }
 337
 338     if ( bufTmp )
 339         in = bufTmp;
 340
 341     for ( const wchar_t * const inEnd = in + inLen;; )
 342     {
 343         // try to convert the current chunk, if anything left
 344         size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
 345         if ( lenChunk == 0 )
 346         {
 347             // nothing left in the input string, conversion succeeded
 348             if ( outLen )
 349                 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
 350
 351             return buf;
 352         }
 353
 354         if ( lenChunk == (size_t)-1 )
 355             break;
 356
 357         const size_t lenBufNew = lenBuf + lenChunk;
 358         if ( !buf.extend(lenBufNew) )
 359             break;
 360
 361         lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 362         if ( lenChunk == (size_t)-1 )
 363             break;
 364
 365         // chunk successfully converted, go to the next one
 366         in += wxWcslen(in) + 1 /* skip NUL too */;
 367         lenBuf = lenBufNew + 1;
 368     }
 369
 370     // conversion failed
 371     if ( outLen )
 372         *outLen = 0;
 373
 374     return wxCharBuffer();
 375 }
 376
 377 // ----------------------------------------------------------------------------
 378 // wxMBConvLibc
 379 // ----------------------------------------------------------------------------
 380
 381 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 382 {
 383     return wxMB2WC(buf, psz, n);
 384 }
 385
 386 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 387 {
 388     return wxWC2MB(buf, psz, n);
 389 }
 390
 391 // ----------------------------------------------------------------------------
 392 // wxConvBrokenFileNames
 393 // ----------------------------------------------------------------------------
 394
 395 #ifdef __UNIX__
 396
 397 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 398 {
 399     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 400                   || wxStricmp(charset, _T("UTF8")) == 0  )
 401         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 402     else
 403         m_conv = new wxCSConv(charset);
 404 }
 405
 406 #endif // __UNIX__
 407
 408 // ----------------------------------------------------------------------------
 409 // UTF-7
 410 // ----------------------------------------------------------------------------
 411
 412 // Implementation (C) 2004 Fredrik Roubert
 413
 414 //
 415 // BASE64 decoding table
 416 //
 417 static const unsigned char utf7unb64[] =
 418 {
 419     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 420     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 421     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 422     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 423     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 424     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 425     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 426     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 427     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 428     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 429     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 430     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 431     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 432     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 433     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 434     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 435     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 436     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 437     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 438     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 439     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 440     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 441     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 442     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 443     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 444     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 445     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 446     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 447     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 448     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 449     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 450     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 451 };
 452
 453 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 454 {
 455     size_t len = 0;
 456
 457     while ( *psz && (!buf || (len < n)) )
 458     {
 459         unsigned char cc = *psz++;
 460         if (cc != '+')
 461         {
 462             // plain ASCII char
 463             if (buf)
 464                 *buf++ = cc;
 465             len++;
 466         }
 467         else if (*psz == '-')
 468         {
 469             // encoded plus sign
 470             if (buf)
 471                 *buf++ = cc;
 472             len++;
 473             psz++;
 474         }
 475         else // start of BASE64 encoded string
 476         {
 477             bool lsb, ok;
 478             unsigned int d, l;
 479             for ( ok = lsb = false, d = 0, l = 0;
 480                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 481                   psz++ )
 482             {
 483                 d <<= 6;
 484                 d += cc;
 485                 for (l += 6; l >= 8; lsb = !lsb)
 486                 {
 487                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 488                     if (lsb)
 489                     {
 490                         if (buf)
 491                             *buf++ |= c;
 492                         len ++;
 493                     }
 494                     else
 495                     {
 496                         if (buf)
 497                             *buf = (wchar_t)(c << 8);
 498                     }
 499
 500                     ok = true;
 501                 }
 502             }
 503
 504             if ( !ok )
 505             {
 506                 // in valid UTF7 we should have valid characters after '+'
 507                 return (size_t)-1;
 508             }
 509
 510             if (*psz == '-')
 511                 psz++;
 512         }
 513     }
 514
 515     if ( buf && (len < n) )
 516         *buf = '\0';
 517
 518     return len;
 519 }
 520
 521 //
 522 // BASE64 encoding table
 523 //
 524 static const unsigned char utf7enb64[] =
 525 {
 526     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 527     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 528     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 529     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 530     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 531     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 532     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 533     '4', '5', '6', '7', '8', '9', '+', '/'
 534 };
 535
 536 //
 537 // UTF-7 encoding table
 538 //
 539 // 0 - Set D (directly encoded characters)
 540 // 1 - Set O (optional direct characters)
 541 // 2 - whitespace characters (optional)
 542 // 3 - special characters
 543 //
 544 static const unsigned char utf7encode[128] =
 545 {
 546     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 547     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 548     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 549     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 550     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 551     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 552     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 553     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 554 };
 555
 556 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 557 {
 558     size_t len = 0;
 559
 560     while (*psz && ((!buf) || (len < n)))
 561     {
 562         wchar_t cc = *psz++;
 563         if (cc < 0x80 && utf7encode[cc] < 1)
 564         {
 565             // plain ASCII char
 566             if (buf)
 567                 *buf++ = (char)cc;
 568             len++;
 569         }
 570 #ifndef WC_UTF16
 571         else if (((wxUint32)cc) > 0xffff)
 572         {
 573             // no surrogate pair generation (yet?)
 574             return (size_t)-1;
 575         }
 576 #endif
 577         else
 578         {
 579             if (buf)
 580                 *buf++ = '+';
 581             len++;
 582             if (cc != '+')
 583             {
 584                 // BASE64 encode string
 585                 unsigned int lsb, d, l;
 586                 for (d = 0, l = 0; /*nothing*/; psz++)
 587                 {
 588                     for (lsb = 0; lsb < 2; lsb ++)
 589                     {
 590                         d <<= 8;
 591                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 592
 593                         for (l += 8; l >= 6; )
 594                         {
 595                             l -= 6;
 596                             if (buf)
 597                                 *buf++ = utf7enb64[(d >> l) % 64];
 598                             len++;
 599                         }
 600                     }
 601                     cc = *psz;
 602                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 603                         break;
 604                 }
 605                 if (l != 0)
 606                 {
 607                     if (buf)
 608                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 609                     len++;
 610                 }
 611             }
 612             if (buf)
 613                 *buf++ = '-';
 614             len++;
 615         }
 616     }
 617     if (buf && (len < n))
 618         *buf = 0;
 619     return len;
 620 }
 621
 622 // ----------------------------------------------------------------------------
 623 // UTF-8
 624 // ----------------------------------------------------------------------------
 625
 626 static wxUint32 utf8_max[]=
 627     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 628
 629 // boundaries of the private use area we use to (temporarily) remap invalid
 630 // characters invalid in a UTF-8 encoded string
 631 const wxUint32 wxUnicodePUA = 0x100000;
 632 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 633
 634 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 635 {
 636     size_t len = 0;
 637
 638     while (*psz && ((!buf) || (len < n)))
 639     {
 640         const char *opsz = psz;
 641         bool invalid = false;
 642         unsigned char cc = *psz++, fc = cc;
 643         unsigned cnt;
 644         for (cnt = 0; fc & 0x80; cnt++)
 645             fc <<= 1;
 646         if (!cnt)
 647         {
 648             // plain ASCII char
 649             if (buf)
 650                 *buf++ = cc;
 651             len++;
 652
 653             // escape the escape character for octal escapes
 654             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 655                     && cc == '\\' && (!buf || len < n))
 656             {
 657                 if (buf)
 658                     *buf++ = cc;
 659                 len++;
 660             }
 661         }
 662         else
 663         {
 664             cnt--;
 665             if (!cnt)
 666             {
 667                 // invalid UTF-8 sequence
 668                 invalid = true;
 669             }
 670             else
 671             {
 672                 unsigned ocnt = cnt - 1;
 673                 wxUint32 res = cc & (0x3f >> cnt);
 674                 while (cnt--)
 675                 {
 676                     cc = *psz;
 677                     if ((cc & 0xC0) != 0x80)
 678                     {
 679                         // invalid UTF-8 sequence
 680                         invalid = true;
 681                         break;
 682                     }
 683                     psz++;
 684                     res = (res << 6) | (cc & 0x3f);
 685                 }
 686                 if (invalid || res <= utf8_max[ocnt])
 687                 {
 688                     // illegal UTF-8 encoding
 689                     invalid = true;
 690                 }
 691                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 692                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 693                 {
 694                     // if one of our PUA characters turns up externally
 695                     // it must also be treated as an illegal sequence
 696                     // (a bit like you have to escape an escape character)
 697                     invalid = true;
 698                 }
 699                 else
 700                 {
 701 #ifdef WC_UTF16
 702                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 703                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 704                     if (pa == (size_t)-1)
 705                     {
 706                         invalid = true;
 707                     }
 708                     else
 709                     {
 710                         if (buf)
 711                             buf += pa;
 712                         len += pa;
 713                     }
 714 #else // !WC_UTF16
 715                     if (buf)
 716                         *buf++ = (wchar_t)res;
 717                     len++;
 718 #endif // WC_UTF16/!WC_UTF16
 719                 }
 720             }
 721             if (invalid)
 722             {
 723                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 724                 {
 725                     while (opsz < psz && (!buf || len < n))
 726                     {
 727 #ifdef WC_UTF16
 728                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 729                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 730                         wxASSERT(pa != (size_t)-1);
 731                         if (buf)
 732                             buf += pa;
 733                         opsz++;
 734                         len += pa;
 735 #else
 736                         if (buf)
 737                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 738                         opsz++;
 739                         len++;
 740 #endif
 741                     }
 742                 }
 743                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 744                 {
 745                     while (opsz < psz && (!buf || len < n))
 746                     {
 747                         if ( buf && len + 3 < n )
 748                         {
 749                             unsigned char on = *opsz;
 750                             *buf++ = L'\\';
 751                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 752                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 753                             *buf++ = (wchar_t)( L'0' + on % 010 );
 754                         }
 755                         opsz++;
 756                         len += 4;
 757                     }
 758                 }
 759                 else // MAP_INVALID_UTF8_NOT
 760                 {
 761                     return (size_t)-1;
 762                 }
 763             }
 764         }
 765     }
 766     if (buf && (len < n))
 767         *buf = 0;
 768     return len;
 769 }
 770
 771 static inline bool isoctal(wchar_t wch)
 772 {
 773     return L'0' <= wch && wch <= L'7';
 774 }
 775
 776 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 777 {
 778     size_t len = 0;
 779
 780     while (*psz && ((!buf) || (len < n)))
 781     {
 782         wxUint32 cc;
 783 #ifdef WC_UTF16
 784         // cast is ok for WC_UTF16
 785         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 786         psz += (pa == (size_t)-1) ? 1 : pa;
 787 #else
 788         cc=(*psz++) & 0x7fffffff;
 789 #endif
 790
 791         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 792                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 793         {
 794             if (buf)
 795                 *buf++ = (char)(cc - wxUnicodePUA);
 796             len++;
 797         }
 798         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 799                     && cc == L'\\' && psz[0] == L'\\' )
 800         {
 801             if (buf)
 802                 *buf++ = (char)cc;
 803             psz++;
 804             len++;
 805         }
 806         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 807                     cc == L'\\' &&
 808                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 809         {
 810             if (buf)
 811             {
 812                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 813                                  (psz[1] - L'0')*010 +
 814                                  (psz[2] - L'0'));
 815             }
 816
 817             psz += 3;
 818             len++;
 819         }
 820         else
 821         {
 822             unsigned cnt;
 823             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 824             if (!cnt)
 825             {
 826                 // plain ASCII char
 827                 if (buf)
 828                     *buf++ = (char) cc;
 829                 len++;
 830             }
 831
 832             else
 833             {
 834                 len += cnt + 1;
 835                 if (buf)
 836                 {
 837                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 838                     while (cnt--)
 839                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 840                 }
 841             }
 842         }
 843     }
 844
 845     if (buf && (len<n))
 846         *buf = 0;
 847
 848     return len;
 849 }
 850
 851 // ----------------------------------------------------------------------------
 852 // UTF-16
 853 // ----------------------------------------------------------------------------
 854
 855 #ifdef WORDS_BIGENDIAN
 856     #define wxMBConvUTF16straight wxMBConvUTF16BE
 857     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 858 #else
 859     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 860     #define wxMBConvUTF16straight wxMBConvUTF16LE
 861 #endif
 862
 863
 864 #ifdef WC_UTF16
 865
 866 // copy 16bit MB to 16bit String
 867 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 868 {
 869     size_t len=0;
 870
 871     while (*(wxUint16*)psz && (!buf || len < n))
 872     {
 873         if (buf)
 874             *buf++ = *(wxUint16*)psz;
 875         len++;
 876
 877         psz += sizeof(wxUint16);
 878     }
 879     if (buf && len<n)   *buf=0;
 880
 881     return len;
 882 }
 883
 884
 885 // copy 16bit String to 16bit MB
 886 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 887 {
 888     size_t len=0;
 889
 890     while (*psz && (!buf || len < n))
 891     {
 892         if (buf)
 893         {
 894             *(wxUint16*)buf = *psz;
 895             buf += sizeof(wxUint16);
 896         }
 897         len += sizeof(wxUint16);
 898         psz++;
 899     }
 900     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 901
 902     return len;
 903 }
 904
 905
 906 // swap 16bit MB to 16bit String
 907 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 908 {
 909     size_t len = 0;
 910
 911     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 912     // inside the string
 913     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 914     {
 915         if ( buf )
 916         {
 917             ((char *)buf)[0] = psz[1];
 918             ((char *)buf)[1] = psz[0];
 919             buf++;
 920         }
 921         len++;
 922         psz += 2;
 923     }
 924
 925     if ( buf && len < n )
 926         *buf = L'\0';
 927
 928     return len;
 929 }
 930
 931
 932 // swap 16bit MB to 16bit String
 933 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 934 {
 935     size_t len = 0;
 936
 937     while ( *psz && (!buf || len < n) )
 938     {
 939         if ( buf )
 940         {
 941             *buf++ = ((char*)psz)[1];
 942             *buf++ = ((char*)psz)[0];
 943         }
 944         len += 2;
 945         psz++;
 946     }
 947
 948     if ( buf && len < n )
 949         *buf = '\0';
 950
 951     return len;
 952 }
 953
 954
 955 #else // WC_UTF16
 956
 957
 958 // copy 16bit MB to 32bit String
 959 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 960 {
 961     size_t len=0;
 962
 963     while (*(wxUint16*)psz && (!buf || len < n))
 964     {
 965         wxUint32 cc;
 966         size_t pa=decode_utf16((wxUint16*)psz, cc);
 967         if (pa == (size_t)-1)
 968             return pa;
 969
 970         if (buf)
 971             *buf++ = (wchar_t)cc;
 972         len++;
 973         psz += pa * sizeof(wxUint16);
 974     }
 975     if (buf && len<n)   *buf=0;
 976
 977     return len;
 978 }
 979
 980
 981 // copy 32bit String to 16bit MB
 982 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 983 {
 984     size_t len=0;
 985
 986     while (*psz && (!buf || len < n))
 987     {
 988         wxUint16 cc[2];
 989         size_t pa=encode_utf16(*psz, cc);
 990
 991         if (pa == (size_t)-1)
 992             return pa;
 993
 994         if (buf)
 995         {
 996             *(wxUint16*)buf = cc[0];
 997             buf += sizeof(wxUint16);
 998             if (pa > 1)
 999             {
1000                 *(wxUint16*)buf = cc[1];
1001                 buf += sizeof(wxUint16);
1002             }
1003         }
1004
1005         len += pa*sizeof(wxUint16);
1006         psz++;
1007     }
1008     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1009
1010     return len;
1011 }
1012
1013
1014 // swap 16bit MB to 32bit String
1015 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1016 {
1017     size_t len=0;
1018
1019     while (*(wxUint16*)psz && (!buf || len < n))
1020     {
1021         wxUint32 cc;
1022         char tmp[4];
1023         tmp[0]=psz[1];  tmp[1]=psz[0];
1024         tmp[2]=psz[3];  tmp[3]=psz[2];
1025
1026         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1027         if (pa == (size_t)-1)
1028             return pa;
1029
1030         if (buf)
1031             *buf++ = (wchar_t)cc;
1032
1033         len++;
1034         psz += pa * sizeof(wxUint16);
1035     }
1036     if (buf && len<n)   *buf=0;
1037
1038     return len;
1039 }
1040
1041
1042 // swap 32bit String to 16bit MB
1043 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1044 {
1045     size_t len=0;
1046
1047     while (*psz && (!buf || len < n))
1048     {
1049         wxUint16 cc[2];
1050         size_t pa=encode_utf16(*psz, cc);
1051
1052         if (pa == (size_t)-1)
1053             return pa;
1054
1055         if (buf)
1056         {
1057             *buf++ = ((char*)cc)[1];
1058             *buf++ = ((char*)cc)[0];
1059             if (pa > 1)
1060             {
1061                 *buf++ = ((char*)cc)[3];
1062                 *buf++ = ((char*)cc)[2];
1063             }
1064         }
1065
1066         len += pa*sizeof(wxUint16);
1067         psz++;
1068     }
1069     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1070
1071     return len;
1072 }
1073
1074 #endif // WC_UTF16
1075
1076
1077 // ----------------------------------------------------------------------------
1078 // UTF-32
1079 // ----------------------------------------------------------------------------
1080
// wxMBConvUTF32straight is an alias for the class using the byte order of this
// machine (big endian if WORDS_BIGENDIAN is defined, little endian otherwise)
// and wxMBConvUTF32swap for the class using the opposite one
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1092
1093
1094 #ifdef WC_UTF16
1095
1096 // copy 32bit MB to 16bit String
1097 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1098 {
1099     size_t len=0;
1100
1101     while (*(wxUint32*)psz && (!buf || len < n))
1102     {
1103         wxUint16 cc[2];
1104
1105         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1106         if (pa == (size_t)-1)
1107             return pa;
1108
1109         if (buf)
1110         {
1111             *buf++ = cc[0];
1112             if (pa > 1)
1113                 *buf++ = cc[1];
1114         }
1115         len += pa;
1116         psz += sizeof(wxUint32);
1117     }
1118     if (buf && len<n)   *buf=0;
1119
1120     return len;
1121 }
1122
1123
1124 // copy 16bit String to 32bit MB
1125 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126 {
1127     size_t len=0;
1128
1129     while (*psz && (!buf || len < n))
1130     {
1131         wxUint32 cc;
1132
1133         // cast is ok for WC_UTF16
1134         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1135         if (pa == (size_t)-1)
1136             return pa;
1137
1138         if (buf)
1139         {
1140             *(wxUint32*)buf = cc;
1141             buf += sizeof(wxUint32);
1142         }
1143         len += sizeof(wxUint32);
1144         psz += pa;
1145     }
1146
1147     if (buf && len<=n-sizeof(wxUint32))
1148         *(wxUint32*)buf=0;
1149
1150     return len;
1151 }
1152
1153
1154
1155 // swap 32bit MB to 16bit String
1156 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1157 {
1158     size_t len=0;
1159
1160     while (*(wxUint32*)psz && (!buf || len < n))
1161     {
1162         char tmp[4];
1163         tmp[0] = psz[3];   tmp[1] = psz[2];
1164         tmp[2] = psz[1];   tmp[3] = psz[0];
1165
1166
1167         wxUint16 cc[2];
1168
1169         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1170         if (pa == (size_t)-1)
1171             return pa;
1172
1173         if (buf)
1174         {
1175             *buf++ = cc[0];
1176             if (pa > 1)
1177                 *buf++ = cc[1];
1178         }
1179         len += pa;
1180         psz += sizeof(wxUint32);
1181     }
1182
1183     if (buf && len<n)
1184         *buf=0;
1185
1186     return len;
1187 }
1188
1189
1190 // swap 16bit String to 32bit MB
1191 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1192 {
1193     size_t len=0;
1194
1195     while (*psz && (!buf || len < n))
1196     {
1197         char cc[4];
1198
1199         // cast is ok for WC_UTF16
1200         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1201         if (pa == (size_t)-1)
1202             return pa;
1203
1204         if (buf)
1205         {
1206             *buf++ = cc[3];
1207             *buf++ = cc[2];
1208             *buf++ = cc[1];
1209             *buf++ = cc[0];
1210         }
1211         len += sizeof(wxUint32);
1212         psz += pa;
1213     }
1214
1215     if (buf && len<=n-sizeof(wxUint32))
1216         *(wxUint32*)buf=0;
1217
1218     return len;
1219 }
1220
1221 #else // WC_UTF16
1222
1223
1224 // copy 32bit MB to 32bit String
1225 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1226 {
1227     size_t len=0;
1228
1229     while (*(wxUint32*)psz && (!buf || len < n))
1230     {
1231         if (buf)
1232             *buf++ = (wchar_t)(*(wxUint32*)psz);
1233         len++;
1234         psz += sizeof(wxUint32);
1235     }
1236
1237     if (buf && len<n)
1238         *buf=0;
1239
1240     return len;
1241 }
1242
1243
1244 // copy 32bit String to 32bit MB
1245 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1246 {
1247     size_t len=0;
1248
1249     while (*psz && (!buf || len < n))
1250     {
1251         if (buf)
1252         {
1253             *(wxUint32*)buf = *psz;
1254             buf += sizeof(wxUint32);
1255         }
1256
1257         len += sizeof(wxUint32);
1258         psz++;
1259     }
1260
1261     if (buf && len<=n-sizeof(wxUint32))
1262         *(wxUint32*)buf=0;
1263
1264     return len;
1265 }
1266
1267
1268 // swap 32bit MB to 32bit String
1269 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1270 {
1271     size_t len=0;
1272
1273     while (*(wxUint32*)psz && (!buf || len < n))
1274     {
1275         if (buf)
1276         {
1277             ((char *)buf)[0] = psz[3];
1278             ((char *)buf)[1] = psz[2];
1279             ((char *)buf)[2] = psz[1];
1280             ((char *)buf)[3] = psz[0];
1281             buf++;
1282         }
1283         len++;
1284         psz += sizeof(wxUint32);
1285     }
1286
1287     if (buf && len<n)
1288         *buf=0;
1289
1290     return len;
1291 }
1292
1293
1294 // swap 32bit String to 32bit MB
1295 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1296 {
1297     size_t len=0;
1298
1299     while (*psz && (!buf || len < n))
1300     {
1301         if (buf)
1302         {
1303             *buf++ = ((char *)psz)[3];
1304             *buf++ = ((char *)psz)[2];
1305             *buf++ = ((char *)psz)[1];
1306             *buf++ = ((char *)psz)[0];
1307         }
1308         len += sizeof(wxUint32);
1309         psz++;
1310     }
1311
1312     if (buf && len<=n-sizeof(wxUint32))
1313         *(wxUint32*)buf=0;
1314
1315     return len;
1316 }
1317
1318
1319 #endif // WC_UTF16
1320
1321
1322 // ============================================================================
1323 // The classes doing conversion using the iconv_xxx() functions
1324 // ============================================================================
1325
1326 #ifdef HAVE_ICONV
1327
1328 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1329 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1330 //     (unless there's yet another bug in glibc) the only case when iconv()
1331 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1332 //     left in the input buffer -- when _real_ error occurs,
1333 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1334 //     iconv() failure.
1335 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tells whether an iconv() call which returned
// cres and left bufLeft bytes of input unconverted must be treated as failed
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// casts the address of an input pointer to the type used here for the input
// buffer argument of iconv() (ICONV_CONST itself is not defined in this part
// of the file)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value a conversion descriptor is compared with to detect that
// iconv_open() failed
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of a single wchar_t and WC_ENC is the
// wxFontEncoding corresponding to the size of wchar_t on this platform
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1357
1358 // ----------------------------------------------------------------------------
1359 // wxMBConv_iconv: encapsulates an iconv character set
1360 // ----------------------------------------------------------------------------
1361
1362 class wxMBConv_iconv : public wxMBConv
1363 {
1364 public:
1365     wxMBConv_iconv(const wxChar *name);
1366     virtual ~wxMBConv_iconv();
1367
1368     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1369     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1370
1371     // classify this encoding as explained in wxMBConv::GetMBNulLen()
1372     // comment
1373     virtual size_t GetMBNulLen() const;
1374
1375     bool IsOk() const
1376         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1377
1378 protected:
1379     // the iconv handlers used to translate from multibyte to wide char and in
1380     // the other direction
1381     iconv_t m2w,
1382             w2m;
1383 #if wxUSE_THREADS
1384     // guards access to m2w and w2m objects
1385     wxMutex m_iconvMutex;
1386 #endif
1387
1388 private:
1389     // the name (for iconv_open()) of a wide char charset -- if none is
1390     // available on this machine, it will remain NULL
1391     static wxString ms_wcCharsetName;
1392
1393     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1394     // different endian-ness than the native one
1395     static bool ms_wcNeedsSwap;
1396
1397     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1398     // initially
1399     size_t m_minMBCharWidth;
1400 };
1401
1402 // make the constructor available for unit testing
1403 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1404 {
1405     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1406     if ( !result->IsOk() )
1407     {
1408         delete result;
1409         return 0;
1410     }
1411     return result;
1412 }
1413
// definitions of the static members: the wchar_t charset name starts empty and
// is filled in by the first wxMBConv_iconv constructed, no byte swapping is
// assumed until that constructor finds that it is needed
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1416
1417 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1418 {
1419     m_minMBCharWidth = 0;
1420
1421     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1422     // names for the charsets
1423     const wxCharBuffer cname(wxString(name).ToAscii());
1424
1425     // check for charset that represents wchar_t:
1426     if ( ms_wcCharsetName.empty() )
1427     {
1428         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1429
1430 #if wxUSE_FONTMAP
1431         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1432 #else // !wxUSE_FONTMAP
1433         static const wxChar *names[] =
1434         {
1435 #if SIZEOF_WCHAR_T == 4
1436             _T("UCS-4"),
1437 #elif SIZEOF_WCHAR_T = 2
1438             _T("UCS-2"),
1439 #endif
1440             NULL
1441         };
1442 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1443
1444         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1445         {
1446             const wxString nameCS(*names);
1447
1448             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1449             wxString nameXE(nameCS);
1450             #ifdef WORDS_BIGENDIAN
1451                 nameXE += _T("BE");
1452             #else // little endian
1453                 nameXE += _T("LE");
1454             #endif
1455
1456             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1457                        nameXE.c_str());
1458
1459             m2w = iconv_open(nameXE.ToAscii(), cname);
1460             if ( m2w == ICONV_T_INVALID )
1461             {
1462                 // try charset w/o bytesex info (e.g. "UCS4")
1463                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1464                            nameCS.c_str());
1465                 m2w = iconv_open(nameCS.ToAscii(), cname);
1466
1467                 // and check for bytesex ourselves:
1468                 if ( m2w != ICONV_T_INVALID )
1469                 {
1470                     char    buf[2], *bufPtr;
1471                     wchar_t wbuf[2], *wbufPtr;
1472                     size_t  insz, outsz;
1473                     size_t  res;
1474
1475                     buf[0] = 'A';
1476                     buf[1] = 0;
1477                     wbuf[0] = 0;
1478                     insz = 2;
1479                     outsz = SIZEOF_WCHAR_T * 2;
1480                     wbufPtr = wbuf;
1481                     bufPtr = buf;
1482
1483                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1484                                 (char**)&wbufPtr, &outsz);
1485
1486                     if (ICONV_FAILED(res, insz))
1487                     {
1488                         wxLogLastError(wxT("iconv"));
1489                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1490                                    nameCS.c_str());
1491                     }
1492                     else // ok, can convert to this encoding, remember it
1493                     {
1494                         ms_wcCharsetName = nameCS;
1495                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1496                     }
1497                 }
1498             }
1499             else // use charset not requiring byte swapping
1500             {
1501                 ms_wcCharsetName = nameXE;
1502             }
1503         }
1504
1505         wxLogTrace(TRACE_STRCONV,
1506                    wxT("iconv wchar_t charset is \"%s\"%s"),
1507                    ms_wcCharsetName.empty() ? _T("<none>")
1508                                             : ms_wcCharsetName.c_str(),
1509                    ms_wcNeedsSwap ? _T(" (needs swap)")
1510                                   : _T(""));
1511     }
1512     else // we already have ms_wcCharsetName
1513     {
1514         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1515     }
1516
1517     if ( ms_wcCharsetName.empty() )
1518     {
1519         w2m = ICONV_T_INVALID;
1520     }
1521     else
1522     {
1523         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1524         if ( w2m == ICONV_T_INVALID )
1525         {
1526             wxLogTrace(TRACE_STRCONV,
1527                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1528                        ms_wcCharsetName.c_str(), cname.data());
1529         }
1530     }
1531 }
1532
1533 wxMBConv_iconv::~wxMBConv_iconv()
1534 {
1535     if ( m2w != ICONV_T_INVALID )
1536         iconv_close(m2w);
1537     if ( w2m != ICONV_T_INVALID )
1538         iconv_close(w2m);
1539 }
1540
1541 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1542 {
1543     // find the string length: notice that must be done differently for
1544     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1545     size_t inbuf;
1546     const size_t nulLen = GetMBNulLen();
1547     switch ( nulLen )
1548     {
1549         default:
1550             return (size_t)-1;
1551
1552         case 1:
1553             inbuf = strlen(psz); // arguably more optimized than our version
1554             break;
1555
1556         case 2:
1557         case 4:
1558             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1559             // they also have to start at character boundary and not span two
1560             // adjacent characters
1561             const char *p;
1562             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1563                 ;
1564             inbuf = p - psz;
1565             break;
1566     }
1567
1568 #if wxUSE_THREADS
1569     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1570     //     Unfortunately there is a couple of global wxCSConv objects such as
1571     //     wxConvLocal that are used all over wx code, so we have to make sure
1572     //     the handle is used by at most one thread at the time. Otherwise
1573     //     only a few wx classes would be safe to use from non-main threads
1574     //     as MB<->WC conversion would fail "randomly".
1575     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1576 #endif // wxUSE_THREADS
1577
1578
1579     size_t outbuf = n * SIZEOF_WCHAR_T;
1580     size_t res, cres;
1581     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1582     wchar_t *bufPtr = buf;
1583     const char *pszPtr = psz;
1584
1585     if (buf)
1586     {
1587         // have destination buffer, convert there
1588         cres = iconv(m2w,
1589                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1590                      (char**)&bufPtr, &outbuf);
1591         res = n - (outbuf / SIZEOF_WCHAR_T);
1592
1593         if (ms_wcNeedsSwap)
1594         {
1595             // convert to native endianness
1596             for ( unsigned i = 0; i < res; i++ )
1597                 buf[n] = WC_BSWAP(buf[i]);
1598         }
1599
1600         // NUL-terminate the string if there is any space left
1601         if (res < n)
1602             buf[res] = 0;
1603     }
1604     else
1605     {
1606         // no destination buffer... convert using temp buffer
1607         // to calculate destination buffer requirement
1608         wchar_t tbuf[8];
1609         res = 0;
1610         do {
1611             bufPtr = tbuf;
1612             outbuf = 8*SIZEOF_WCHAR_T;
1613
1614             cres = iconv(m2w,
1615                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1616                          (char**)&bufPtr, &outbuf );
1617
1618             res += 8-(outbuf/SIZEOF_WCHAR_T);
1619         } while ((cres==(size_t)-1) && (errno==E2BIG));
1620     }
1621
1622     if (ICONV_FAILED(cres, inbuf))
1623     {
1624         //VS: it is ok if iconv fails, hence trace only
1625         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1626         return (size_t)-1;
1627     }
1628
1629     return res;
1630 }
1631
1632 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1633 {
1634 #if wxUSE_THREADS
1635     // NB: explained in MB2WC
1636     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1637 #endif
1638
1639     size_t inlen = wxWcslen(psz);
1640     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1641     size_t outbuf = n;
1642     size_t res, cres;
1643
1644     wchar_t *tmpbuf = 0;
1645
1646     if (ms_wcNeedsSwap)
1647     {
1648         // need to copy to temp buffer to switch endianness
1649         // (doing WC_BSWAP twice on the original buffer won't help, as it
1650         //  could be in read-only memory, or be accessed in some other thread)
1651         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1652         for ( size_t i = 0; i < inlen; i++ )
1653             tmpbuf[n] = WC_BSWAP(psz[i]);
1654         tmpbuf[inlen] = L'\0';
1655         psz = tmpbuf;
1656     }
1657
1658     if (buf)
1659     {
1660         // have destination buffer, convert there
1661         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1662
1663         res = n-outbuf;
1664
1665         // NB: iconv was given only wcslen(psz) characters on input, and so
1666         //     it couldn't convert the trailing zero. Let's do it ourselves
1667         //     if there's some room left for it in the output buffer.
1668         if (res < n)
1669             buf[0] = 0;
1670     }
1671     else
1672     {
1673         // no destination buffer... convert using temp buffer
1674         // to calculate destination buffer requirement
1675         char tbuf[16];
1676         res = 0;
1677         do {
1678             buf = tbuf; outbuf = 16;
1679
1680             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1681
1682             res += 16 - outbuf;
1683         } while ((cres==(size_t)-1) && (errno==E2BIG));
1684     }
1685
1686     if (ms_wcNeedsSwap)
1687     {
1688         free(tmpbuf);
1689     }
1690
1691     if (ICONV_FAILED(cres, inbuf))
1692     {
1693         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1694         return (size_t)-1;
1695     }
1696
1697     return res;
1698 }
1699
1700 size_t wxMBConv_iconv::GetMBNulLen() const
1701 {
1702     if ( m_minMBCharWidth == 0 )
1703     {
1704         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1705
1706 #if wxUSE_THREADS
1707         // NB: explained in MB2WC
1708         wxMutexLocker lock(self->m_iconvMutex);
1709 #endif
1710
1711         wchar_t *wnul = L"";
1712         char buf[8]; // should be enough for NUL in any encoding
1713         size_t inLen = sizeof(wchar_t),
1714                outLen = WXSIZEOF(buf);
1715         char *in = (char *)wnul;
1716         char *out = buf;
1717         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1718         {
1719             self->m_minMBCharWidth = (size_t)-1;
1720         }
1721         else // ok
1722         {
1723             self->m_minMBCharWidth = out - buf;
1724         }
1725     }
1726
1727     return m_minMBCharWidth;
1728 }
1729
1730 #endif // HAVE_ICONV
1731
1732
1733 // ============================================================================
1734 // Win32 conversion classes
1735 // ============================================================================
1736
1737 #ifdef wxHAVE_WIN32_MB2WC
1738
1739 // from utils.cpp
1740 #if wxUSE_FONTMAP
1741 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1742 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1743 #endif
1744
1745 class wxMBConv_win32 : public wxMBConv
1746 {
1747 public:
1748     wxMBConv_win32()
1749     {
1750         m_CodePage = CP_ACP;
1751         m_minMBCharWidth = 0;
1752     }
1753
1754 #if wxUSE_FONTMAP
1755     wxMBConv_win32(const wxChar* name)
1756     {
1757         m_CodePage = wxCharsetToCodepage(name);
1758         m_minMBCharWidth = 0;
1759     }
1760
1761     wxMBConv_win32(wxFontEncoding encoding)
1762     {
1763         m_CodePage = wxEncodingToCodepage(encoding);
1764         m_minMBCharWidth = 0;
1765     }
1766 #endif // wxUSE_FONTMAP
1767
1768     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1769     {
1770         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1771         // the behaviour is not compatible with the Unix version (using iconv)
1772         // and break the library itself, e.g. wxTextInputStream::NextChar()
1773         // wouldn't work if reading an incomplete MB char didn't result in an
1774         // error
1775         //
1776         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1777         // an error (tested under Windows Server 2003) and apparently it is
1778         // done on purpose, i.e. the function accepts any input in this case
1779         // and although I'd prefer to return error on ill-formed output, our
1780         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1781         // explicitly ill-formed according to RFC 2152) neither so we don't
1782         // even have any fallback here...
1783         //
1784         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1785         // Win XP or newer and if it is specified on older versions, conversion
1786         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1787         // fails. So we can only use the flag on newer Windows versions.
1788         // Additionally, the flag is not supported by UTF7, symbol and CJK
1789         // encodings. See here:
1790         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1791         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1792         int flags = 0;
1793         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1794              m_CodePage < 50000 &&
1795              IsAtLeastWin2kSP4() )
1796         {
1797             flags = MB_ERR_INVALID_CHARS;
1798         }
1799         else if ( m_CodePage == CP_UTF8 )
1800         {
1801             // Avoid round-trip in the special case of UTF-8 by using our
1802             // own UTF-8 conversion code:
1803             return wxMBConvUTF8().MB2WC(buf, psz, n);
1804         }
1805
1806         const size_t len = ::MultiByteToWideChar
1807                              (
1808                                 m_CodePage,     // code page
1809                                 flags,          // flags: fall on error
1810                                 psz,            // input string
1811                                 -1,             // its length (NUL-terminated)
1812                                 buf,            // output string
1813                                 buf ? n : 0     // size of output buffer
1814                              );
1815         if ( !len )
1816         {
1817             // function totally failed
1818             return (size_t)-1;
1819         }
1820
1821         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1822         // check if we succeeded, by doing a double trip:
1823         if ( !flags && buf )
1824         {
1825             const size_t mbLen = strlen(psz);
1826             wxCharBuffer mbBuf(mbLen);
1827             if ( ::WideCharToMultiByte
1828                    (
1829                       m_CodePage,
1830                       0,
1831                       buf,
1832                       -1,
1833                       mbBuf.data(),
1834                       mbLen + 1,        // size in bytes, not length
1835                       NULL,
1836                       NULL
1837                    ) == 0 ||
1838                   strcmp(mbBuf, psz) != 0 )
1839             {
1840                 // we didn't obtain the same thing we started from, hence
1841                 // the conversion was lossy and we consider that it failed
1842                 return (size_t)-1;
1843             }
1844         }
1845
1846         // note that it returns count of written chars for buf != NULL and size
1847         // of the needed buffer for buf == NULL so in either case the length of
1848         // the string (which never includes the terminating NUL) is one less
1849         return len - 1;
1850     }
1851
1852     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1853     {
1854         /*
1855             we have a problem here: by default, WideCharToMultiByte() may
1856             replace characters unrepresentable in the target code page with bad
1857             quality approximations such as turning "1/2" symbol (U+00BD) into
1858             "1" for the code pages which don't have it and we, obviously, want
1859             to avoid this at any price
1860
1861             the trouble is that this function does it _silently_, i.e. it won't
1862             even tell us whether it did or not... Win98/2000 and higher provide
1863             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1864             we have to resort to a round trip, i.e. check that converting back
1865             results in the same string -- this is, of course, expensive but
1866             otherwise we simply can't be sure to not garble the data.
1867          */
1868
1869         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1870         // it doesn't work with CJK encodings (which we test for rather roughly
1871         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1872         // supporting it
1873         BOOL usedDef wxDUMMY_INITIALIZE(false);
1874         BOOL *pUsedDef;
1875         int flags;
1876         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1877         {
1878             // it's our lucky day
1879             flags = WC_NO_BEST_FIT_CHARS;
1880             pUsedDef = &usedDef;
1881         }
1882         else // old system or unsupported encoding
1883         {
1884             flags = 0;
1885             pUsedDef = NULL;
1886         }
1887
1888         const size_t len = ::WideCharToMultiByte
1889                              (
1890                                 m_CodePage,     // code page
1891                                 flags,          // either none or no best fit
1892                                 pwz,            // input string
1893                                 -1,             // it is (wide) NUL-terminated
1894                                 buf,            // output buffer
1895                                 buf ? n : 0,    // and its size
1896                                 NULL,           // default "replacement" char
1897                                 pUsedDef        // [out] was it used?
1898                              );
1899
1900         if ( !len )
1901         {
1902             // function totally failed
1903             return (size_t)-1;
1904         }
1905
1906         // if we were really converting, check if we succeeded
1907         if ( buf )
1908         {
1909             if ( flags )
1910             {
1911                 // check if the conversion failed, i.e. if any replacements
1912                 // were done
1913                 if ( usedDef )
1914                     return (size_t)-1;
1915             }
1916             else // we must resort to double tripping...
1917             {
1918                 wxWCharBuffer wcBuf(n);
1919                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1920                         wcscmp(wcBuf, pwz) != 0 )
1921                 {
1922                     // we didn't obtain the same thing we started from, hence
1923                     // the conversion was lossy and we consider that it failed
1924                     return (size_t)-1;
1925                 }
1926             }
1927         }
1928
1929         // see the comment above for the reason of "len - 1"
1930         return len - 1;
1931     }
1932
1933     virtual size_t GetMBNulLen() const
1934     {
1935         if ( m_minMBCharWidth == 0 )
1936         {
1937             int len = ::WideCharToMultiByte
1938                         (
1939                             m_CodePage,     // code page
1940                             0,              // no flags
1941                             L"",            // input string
1942                             1,              // translate just the NUL
1943                             NULL,           // output buffer
1944                             0,              // and its size
1945                             NULL,           // no replacement char
1946                             NULL            // [out] don't care if it was used
1947                         );
1948
1949             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1950             switch ( len )
1951             {
1952                 default:
1953                     wxLogDebug(_T("Unexpected NUL length %d"), len);
1954                     // fall through
1955
1956                 case 0:
1957                     self->m_minMBCharWidth = (size_t)-1;
1958                     break;
1959
1960                 case 1:
1961                 case 2:
1962                 case 4:
1963                     self->m_minMBCharWidth = len;
1964                     break;
1965             }
1966         }
1967
1968         return m_minMBCharWidth;
1969     }
1970
1971     bool IsOk() const { return m_CodePage != -1; }
1972
1973 private:
1974     static bool CanUseNoBestFit()
1975     {
1976         static int s_isWin98Or2k = -1;
1977
1978         if ( s_isWin98Or2k == -1 )
1979         {
1980             int verMaj, verMin;
1981             switch ( wxGetOsVersion(&verMaj, &verMin) )
1982             {
1983                 case wxWIN95:
1984                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1985                     break;
1986
1987                 case wxWINDOWS_NT:
1988                     s_isWin98Or2k = verMaj >= 5;
1989                     break;
1990
1991                 default:
1992                     // unknown, be conseravtive by default
1993                     s_isWin98Or2k = 0;
1994             }
1995
1996             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1997         }
1998
1999         return s_isWin98Or2k == 1;
2000     }
2001
2002     static bool IsAtLeastWin2kSP4()
2003     {
2004 #ifdef __WXWINCE__
2005         return false;
2006 #else
2007         static int s_isAtLeastWin2kSP4 = -1;
2008
2009         if ( s_isAtLeastWin2kSP4 == -1 )
2010         {
2011             OSVERSIONINFOEX ver;
2012
2013             memset(&ver, 0, sizeof(ver));
2014             ver.dwOSVersionInfoSize = sizeof(ver);
2015             GetVersionEx((OSVERSIONINFO*)&ver);
2016
2017             s_isAtLeastWin2kSP4 =
2018               ((ver.dwMajorVersion > 5) || // Vista+
2019                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2020                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2021                ver.wServicePackMajor >= 4)) // 2000 SP4+
2022               ? 1 : 0;
2023         }
2024
2025         return s_isAtLeastWin2kSP4 == 1;
2026 #endif
2027     }
2028
2029
2030     // the code page we're working with
2031     long m_CodePage;
2032
2033     // cached result of GetMBNulLen(), set to 0 initially meaning
2034     // "unknown"
2035     size_t m_minMBCharWidth;
2036 };
2037
2038 #endif // wxHAVE_WIN32_MB2WC
2039
2040 // ============================================================================
2041 // Cocoa conversion classes
2042 // ============================================================================
2043
2044 #if defined(__WXCOCOA__)
2045
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2049
2050 #include <CoreFoundation/CFString.h>
2051 #include <CoreFoundation/CFStringEncodingExt.h>
2052
2053 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2054 {
2055     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2056     if ( encoding == wxFONTENCODING_DEFAULT )
2057     {
2058         enc = CFStringGetSystemEncoding();
2059     }
2060     else switch( encoding)
2061     {
2062         case wxFONTENCODING_ISO8859_1 :
2063             enc = kCFStringEncodingISOLatin1 ;
2064             break ;
2065         case wxFONTENCODING_ISO8859_2 :
2066             enc = kCFStringEncodingISOLatin2;
2067             break ;
2068         case wxFONTENCODING_ISO8859_3 :
2069             enc = kCFStringEncodingISOLatin3 ;
2070             break ;
2071         case wxFONTENCODING_ISO8859_4 :
2072             enc = kCFStringEncodingISOLatin4;
2073             break ;
2074         case wxFONTENCODING_ISO8859_5 :
2075             enc = kCFStringEncodingISOLatinCyrillic;
2076             break ;
2077         case wxFONTENCODING_ISO8859_6 :
2078             enc = kCFStringEncodingISOLatinArabic;
2079             break ;
2080         case wxFONTENCODING_ISO8859_7 :
2081             enc = kCFStringEncodingISOLatinGreek;
2082             break ;
2083         case wxFONTENCODING_ISO8859_8 :
2084             enc = kCFStringEncodingISOLatinHebrew;
2085             break ;
2086         case wxFONTENCODING_ISO8859_9 :
2087             enc = kCFStringEncodingISOLatin5;
2088             break ;
2089         case wxFONTENCODING_ISO8859_10 :
2090             enc = kCFStringEncodingISOLatin6;
2091             break ;
2092         case wxFONTENCODING_ISO8859_11 :
2093             enc = kCFStringEncodingISOLatinThai;
2094             break ;
2095         case wxFONTENCODING_ISO8859_13 :
2096             enc = kCFStringEncodingISOLatin7;
2097             break ;
2098         case wxFONTENCODING_ISO8859_14 :
2099             enc = kCFStringEncodingISOLatin8;
2100             break ;
2101         case wxFONTENCODING_ISO8859_15 :
2102             enc = kCFStringEncodingISOLatin9;
2103             break ;
2104
2105         case wxFONTENCODING_KOI8 :
2106             enc = kCFStringEncodingKOI8_R;
2107             break ;
2108         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2109             enc = kCFStringEncodingDOSRussian;
2110             break ;
2111
2112 //      case wxFONTENCODING_BULGARIAN :
2113 //          enc = ;
2114 //          break ;
2115
2116         case wxFONTENCODING_CP437 :
2117             enc =kCFStringEncodingDOSLatinUS ;
2118             break ;
2119         case wxFONTENCODING_CP850 :
2120             enc = kCFStringEncodingDOSLatin1;
2121             break ;
2122         case wxFONTENCODING_CP852 :
2123             enc = kCFStringEncodingDOSLatin2;
2124             break ;
2125         case wxFONTENCODING_CP855 :
2126             enc = kCFStringEncodingDOSCyrillic;
2127             break ;
2128         case wxFONTENCODING_CP866 :
2129             enc =kCFStringEncodingDOSRussian ;
2130             break ;
2131         case wxFONTENCODING_CP874 :
2132             enc = kCFStringEncodingDOSThai;
2133             break ;
2134         case wxFONTENCODING_CP932 :
2135             enc = kCFStringEncodingDOSJapanese;
2136             break ;
2137         case wxFONTENCODING_CP936 :
2138             enc =kCFStringEncodingDOSChineseSimplif ;
2139             break ;
2140         case wxFONTENCODING_CP949 :
2141             enc = kCFStringEncodingDOSKorean;
2142             break ;
2143         case wxFONTENCODING_CP950 :
2144             enc = kCFStringEncodingDOSChineseTrad;
2145             break ;
2146         case wxFONTENCODING_CP1250 :
2147             enc = kCFStringEncodingWindowsLatin2;
2148             break ;
2149         case wxFONTENCODING_CP1251 :
2150             enc =kCFStringEncodingWindowsCyrillic ;
2151             break ;
2152         case wxFONTENCODING_CP1252 :
2153             enc =kCFStringEncodingWindowsLatin1 ;
2154             break ;
2155         case wxFONTENCODING_CP1253 :
2156             enc = kCFStringEncodingWindowsGreek;
2157             break ;
2158         case wxFONTENCODING_CP1254 :
2159             enc = kCFStringEncodingWindowsLatin5;
2160             break ;
2161         case wxFONTENCODING_CP1255 :
2162             enc =kCFStringEncodingWindowsHebrew ;
2163             break ;
2164         case wxFONTENCODING_CP1256 :
2165             enc =kCFStringEncodingWindowsArabic ;
2166             break ;
2167         case wxFONTENCODING_CP1257 :
2168             enc = kCFStringEncodingWindowsBalticRim;
2169             break ;
2170 //   This only really encodes to UTF7 (if that) evidently
2171 //        case wxFONTENCODING_UTF7 :
2172 //            enc = kCFStringEncodingNonLossyASCII ;
2173 //            break ;
2174         case wxFONTENCODING_UTF8 :
2175             enc = kCFStringEncodingUTF8 ;
2176             break ;
2177         case wxFONTENCODING_EUC_JP :
2178             enc = kCFStringEncodingEUC_JP;
2179             break ;
2180         case wxFONTENCODING_UTF16 :
2181             enc = kCFStringEncodingUnicode ;
2182             break ;
2183         case wxFONTENCODING_MACROMAN :
2184             enc = kCFStringEncodingMacRoman ;
2185             break ;
2186         case wxFONTENCODING_MACJAPANESE :
2187             enc = kCFStringEncodingMacJapanese ;
2188             break ;
2189         case wxFONTENCODING_MACCHINESETRAD :
2190             enc = kCFStringEncodingMacChineseTrad ;
2191             break ;
2192         case wxFONTENCODING_MACKOREAN :
2193             enc = kCFStringEncodingMacKorean ;
2194             break ;
2195         case wxFONTENCODING_MACARABIC :
2196             enc = kCFStringEncodingMacArabic ;
2197             break ;
2198         case wxFONTENCODING_MACHEBREW :
2199             enc = kCFStringEncodingMacHebrew ;
2200             break ;
2201         case wxFONTENCODING_MACGREEK :
2202             enc = kCFStringEncodingMacGreek ;
2203             break ;
2204         case wxFONTENCODING_MACCYRILLIC :
2205             enc = kCFStringEncodingMacCyrillic ;
2206             break ;
2207         case wxFONTENCODING_MACDEVANAGARI :
2208             enc = kCFStringEncodingMacDevanagari ;
2209             break ;
2210         case wxFONTENCODING_MACGURMUKHI :
2211             enc = kCFStringEncodingMacGurmukhi ;
2212             break ;
2213         case wxFONTENCODING_MACGUJARATI :
2214             enc = kCFStringEncodingMacGujarati ;
2215             break ;
2216         case wxFONTENCODING_MACORIYA :
2217             enc = kCFStringEncodingMacOriya ;
2218             break ;
2219         case wxFONTENCODING_MACBENGALI :
2220             enc = kCFStringEncodingMacBengali ;
2221             break ;
2222         case wxFONTENCODING_MACTAMIL :
2223             enc = kCFStringEncodingMacTamil ;
2224             break ;
2225         case wxFONTENCODING_MACTELUGU :
2226             enc = kCFStringEncodingMacTelugu ;
2227             break ;
2228         case wxFONTENCODING_MACKANNADA :
2229             enc = kCFStringEncodingMacKannada ;
2230             break ;
2231         case wxFONTENCODING_MACMALAJALAM :
2232             enc = kCFStringEncodingMacMalayalam ;
2233             break ;
2234         case wxFONTENCODING_MACSINHALESE :
2235             enc = kCFStringEncodingMacSinhalese ;
2236             break ;
2237         case wxFONTENCODING_MACBURMESE :
2238             enc = kCFStringEncodingMacBurmese ;
2239             break ;
2240         case wxFONTENCODING_MACKHMER :
2241             enc = kCFStringEncodingMacKhmer ;
2242             break ;
2243         case wxFONTENCODING_MACTHAI :
2244             enc = kCFStringEncodingMacThai ;
2245             break ;
2246         case wxFONTENCODING_MACLAOTIAN :
2247             enc = kCFStringEncodingMacLaotian ;
2248             break ;
2249         case wxFONTENCODING_MACGEORGIAN :
2250             enc = kCFStringEncodingMacGeorgian ;
2251             break ;
2252         case wxFONTENCODING_MACARMENIAN :
2253             enc = kCFStringEncodingMacArmenian ;
2254             break ;
2255         case wxFONTENCODING_MACCHINESESIMP :
2256             enc = kCFStringEncodingMacChineseSimp ;
2257             break ;
2258         case wxFONTENCODING_MACTIBETAN :
2259             enc = kCFStringEncodingMacTibetan ;
2260             break ;
2261         case wxFONTENCODING_MACMONGOLIAN :
2262             enc = kCFStringEncodingMacMongolian ;
2263             break ;
2264         case wxFONTENCODING_MACETHIOPIC :
2265             enc = kCFStringEncodingMacEthiopic ;
2266             break ;
2267         case wxFONTENCODING_MACCENTRALEUR :
2268             enc = kCFStringEncodingMacCentralEurRoman ;
2269             break ;
2270         case wxFONTENCODING_MACVIATNAMESE :
2271             enc = kCFStringEncodingMacVietnamese ;
2272             break ;
2273         case wxFONTENCODING_MACARABICEXT :
2274             enc = kCFStringEncodingMacExtArabic ;
2275             break ;
2276         case wxFONTENCODING_MACSYMBOL :
2277             enc = kCFStringEncodingMacSymbol ;
2278             break ;
2279         case wxFONTENCODING_MACDINGBATS :
2280             enc = kCFStringEncodingMacDingbats ;
2281             break ;
2282         case wxFONTENCODING_MACTURKISH :
2283             enc = kCFStringEncodingMacTurkish ;
2284             break ;
2285         case wxFONTENCODING_MACCROATIAN :
2286             enc = kCFStringEncodingMacCroatian ;
2287             break ;
2288         case wxFONTENCODING_MACICELANDIC :
2289             enc = kCFStringEncodingMacIcelandic ;
2290             break ;
2291         case wxFONTENCODING_MACROMANIAN :
2292             enc = kCFStringEncodingMacRomanian ;
2293             break ;
2294         case wxFONTENCODING_MACCELTIC :
2295             enc = kCFStringEncodingMacCeltic ;
2296             break ;
2297         case wxFONTENCODING_MACGAELIC :
2298             enc = kCFStringEncodingMacGaelic ;
2299             break ;
2300 //      case wxFONTENCODING_MACKEYBOARD :
2301 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2302 //          break ;
2303         default :
2304             // because gcc is picky
2305             break ;
2306     } ;
2307     return enc ;
2308 }
2309
2310 class wxMBConv_cocoa : public wxMBConv
2311 {
2312 public:
2313     wxMBConv_cocoa()
2314     {
2315         Init(CFStringGetSystemEncoding()) ;
2316     }
2317
2318 #if wxUSE_FONTMAP
2319     wxMBConv_cocoa(const wxChar* name)
2320     {
2321         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2322     }
2323 #endif
2324
2325     wxMBConv_cocoa(wxFontEncoding encoding)
2326     {
2327         Init( wxCFStringEncFromFontEnc(encoding) );
2328     }
2329
2330     ~wxMBConv_cocoa()
2331     {
2332     }
2333
2334     void Init( CFStringEncoding encoding)
2335     {
2336         m_encoding = encoding ;
2337     }
2338
2339     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2340     {
2341         wxASSERT(szUnConv);
2342
2343         CFStringRef theString = CFStringCreateWithBytes (
2344                                                 NULL, //the allocator
2345                                                 (const UInt8*)szUnConv,
2346                                                 strlen(szUnConv),
2347                                                 m_encoding,
2348                                                 false //no BOM/external representation
2349                                                 );
2350
2351         wxASSERT(theString);
2352
2353         size_t nOutLength = CFStringGetLength(theString);
2354
2355         if (szOut == NULL)
2356         {
2357             CFRelease(theString);
2358             return nOutLength;
2359         }
2360
2361         CFRange theRange = { 0, nOutSize };
2362
2363 #if SIZEOF_WCHAR_T == 4
2364         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2365 #endif
2366
2367         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2368
2369         CFRelease(theString);
2370
2371         szUniCharBuffer[nOutLength] = '\0' ;
2372
2373 #if SIZEOF_WCHAR_T == 4
2374         wxMBConvUTF16 converter ;
2375         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2376         delete[] szUniCharBuffer;
2377 #endif
2378
2379         return nOutLength;
2380     }
2381
2382     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2383     {
2384         wxASSERT(szUnConv);
2385
2386         size_t nRealOutSize;
2387         size_t nBufSize = wxWcslen(szUnConv);
2388         UniChar* szUniBuffer = (UniChar*) szUnConv;
2389
2390 #if SIZEOF_WCHAR_T == 4
2391         wxMBConvUTF16 converter ;
2392         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2393         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2394         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2395         nBufSize /= sizeof(UniChar);
2396 #endif
2397
2398         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2399                                 NULL, //allocator
2400                                 szUniBuffer,
2401                                 nBufSize,
2402                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2403                             );
2404
2405         wxASSERT(theString);
2406
2407         //Note that CER puts a BOM when converting to unicode
2408         //so we  check and use getchars instead in that case
2409         if (m_encoding == kCFStringEncodingUnicode)
2410         {
2411             if (szOut != NULL)
2412                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2413
2414             nRealOutSize = CFStringGetLength(theString) + 1;
2415         }
2416         else
2417         {
2418             CFStringGetBytes(
2419                 theString,
2420                 CFRangeMake(0, CFStringGetLength(theString)),
2421                 m_encoding,
2422                 0, //what to put in characters that can't be converted -
2423                     //0 tells CFString to return NULL if it meets such a character
2424                 false, //not an external representation
2425                 (UInt8*) szOut,
2426                 nOutSize,
2427                 (CFIndex*) &nRealOutSize
2428                         );
2429         }
2430
2431         CFRelease(theString);
2432
2433 #if SIZEOF_WCHAR_T == 4
2434         delete[] szUniBuffer;
2435 #endif
2436
2437         return  nRealOutSize - 1;
2438     }
2439
2440     bool IsOk() const
2441     {
2442         return m_encoding != kCFStringEncodingInvalidId &&
2443               CFStringIsEncodingAvailable(m_encoding);
2444     }
2445
2446 private:
2447     CFStringEncoding m_encoding ;
2448 };
2449
2450 #endif // defined(__WXCOCOA__)
2451
2452 // ============================================================================
2453 // Mac conversion classes
2454 // ============================================================================
2455
2456 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2457
2458 class wxMBConv_mac : public wxMBConv
2459 {
2460 public:
2461     wxMBConv_mac()
2462     {
2463         Init(CFStringGetSystemEncoding()) ;
2464     }
2465
2466 #if wxUSE_FONTMAP
2467     wxMBConv_mac(const wxChar* name)
2468     {
2469         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2470     }
2471 #endif
2472
2473     wxMBConv_mac(wxFontEncoding encoding)
2474     {
2475         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2476     }
2477
2478     ~wxMBConv_mac()
2479     {
2480         OSStatus status = noErr ;
2481         status = TECDisposeConverter(m_MB2WC_converter);
2482         status = TECDisposeConverter(m_WC2MB_converter);
2483     }
2484
2485
2486     void Init( TextEncodingBase encoding)
2487     {
2488         OSStatus status = noErr ;
2489         m_char_encoding = encoding ;
2490         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2491
2492         status = TECCreateConverter(&m_MB2WC_converter,
2493                                     m_char_encoding,
2494                                     m_unicode_encoding);
2495         status = TECCreateConverter(&m_WC2MB_converter,
2496                                     m_unicode_encoding,
2497                                     m_char_encoding);
2498     }
2499
2500     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2501     {
2502         OSStatus status = noErr ;
2503         ByteCount byteOutLen ;
2504         ByteCount byteInLen = strlen(psz) ;
2505         wchar_t *tbuf = NULL ;
2506         UniChar* ubuf = NULL ;
2507         size_t res = 0 ;
2508
2509         if (buf == NULL)
2510         {
2511             //apple specs say at least 32
2512             n = wxMax( 32 , byteInLen ) ;
2513             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2514         }
2515         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2516 #if SIZEOF_WCHAR_T == 4
2517         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2518 #else
2519         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2520 #endif
2521         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2522           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2523 #if SIZEOF_WCHAR_T == 4
2524         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2525         // is not properly terminated we get random characters at the end
2526         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2527         wxMBConvUTF16 converter ;
2528         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2529         free( ubuf ) ;
2530 #else
2531         res = byteOutLen / sizeof( UniChar ) ;
2532 #endif
2533         if ( buf == NULL )
2534              free(tbuf) ;
2535
2536         if ( buf  && res < n)
2537             buf[res] = 0;
2538
2539         return res ;
2540     }
2541
2542     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2543     {
2544         OSStatus status = noErr ;
2545         ByteCount byteOutLen ;
2546         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2547
2548         char *tbuf = NULL ;
2549
2550         if (buf == NULL)
2551         {
2552             //apple specs say at least 32
2553             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2554             tbuf = (char*) malloc( n ) ;
2555         }
2556
2557         ByteCount byteBufferLen = n ;
2558         UniChar* ubuf = NULL ;
2559 #if SIZEOF_WCHAR_T == 4
2560         wxMBConvUTF16 converter ;
2561         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2562         byteInLen = unicharlen ;
2563         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2564         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2565 #else
2566         ubuf = (UniChar*) psz ;
2567 #endif
2568         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2569             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2570 #if SIZEOF_WCHAR_T == 4
2571         free( ubuf ) ;
2572 #endif
2573         if ( buf == NULL )
2574             free(tbuf) ;
2575
2576         size_t res = byteOutLen ;
2577         if ( buf  && res < n)
2578         {
2579             buf[res] = 0;
2580
2581             //we need to double-trip to verify it didn't insert any ? in place
2582             //of bogus characters
2583             wxWCharBuffer wcBuf(n);
2584             size_t pszlen = wxWcslen(psz);
2585             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2586                         wxWcslen(wcBuf) != pszlen ||
2587                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2588             {
2589                 // we didn't obtain the same thing we started from, hence
2590                 // the conversion was lossy and we consider that it failed
2591                 return (size_t)-1;
2592             }
2593         }
2594
2595         return res ;
2596     }
2597
2598     bool IsOk() const
2599         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2600
2601 private:
2602     TECObjectRef m_MB2WC_converter ;
2603     TECObjectRef m_WC2MB_converter ;
2604
2605     TextEncodingBase m_char_encoding ;
2606     TextEncodingBase m_unicode_encoding ;
2607 };
2608
2609 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2610
2611 // ============================================================================
2612 // wxEncodingConverter based conversion classes
2613 // ============================================================================
2614
2615 #if wxUSE_FONTMAP
2616
2617 class wxMBConv_wxwin : public wxMBConv
2618 {
2619 private:
2620     void Init()
2621     {
2622         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2623                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2624     }
2625
2626 public:
2627     // temporarily just use wxEncodingConverter stuff,
2628     // so that it works while a better implementation is built
2629     wxMBConv_wxwin(const wxChar* name)
2630     {
2631         if (name)
2632             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2633         else
2634             m_enc = wxFONTENCODING_SYSTEM;
2635
2636         Init();
2637     }
2638
2639     wxMBConv_wxwin(wxFontEncoding enc)
2640     {
2641         m_enc = enc;
2642
2643         Init();
2644     }
2645
2646     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2647     {
2648         size_t inbuf = strlen(psz);
2649         if (buf)
2650         {
2651             if (!m2w.Convert(psz,buf))
2652                 return (size_t)-1;
2653         }
2654         return inbuf;
2655     }
2656
2657     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2658     {
2659         const size_t inbuf = wxWcslen(psz);
2660         if (buf)
2661         {
2662             if (!w2m.Convert(psz,buf))
2663                 return (size_t)-1;
2664         }
2665
2666         return inbuf;
2667     }
2668
2669     virtual size_t GetMBNulLen() const
2670     {
2671         switch ( m_enc )
2672         {
2673             case wxFONTENCODING_UTF16BE:
2674             case wxFONTENCODING_UTF16LE:
2675                 return 2;
2676
2677             case wxFONTENCODING_UTF32BE:
2678             case wxFONTENCODING_UTF32LE:
2679                 return 4;
2680
2681             default:
2682                 return 1;
2683         }
2684     }
2685
2686     bool IsOk() const { return m_ok; }
2687
2688 public:
2689     wxFontEncoding m_enc;
2690     wxEncodingConverter m2w, w2m;
2691
2692 private:
2693     // were we initialized successfully?
2694     bool m_ok;
2695
2696     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2697 };
2698
2699 // make the constructors available for unit testing
2700 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2701 {
2702     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2703     if ( !result->IsOk() )
2704     {
2705         delete result;
2706         return 0;
2707     }
2708     return result;
2709 }
2710
2711 #endif // wxUSE_FONTMAP
2712
2713 // ============================================================================
2714 // wxCSConv implementation
2715 // ============================================================================
2716
2717 void wxCSConv::Init()
2718 {
2719     m_name = NULL;
2720     m_convReal =  NULL;
2721     m_deferred = true;
2722 }
2723
2724 wxCSConv::wxCSConv(const wxChar *charset)
2725 {
2726     Init();
2727
2728     if ( charset )
2729     {
2730         SetName(charset);
2731     }
2732
2733 #if wxUSE_FONTMAP
2734     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2735 #else
2736     m_encoding = wxFONTENCODING_SYSTEM;
2737 #endif
2738 }
2739
2740 wxCSConv::wxCSConv(wxFontEncoding encoding)
2741 {
2742     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2743     {
2744         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2745
2746         encoding = wxFONTENCODING_SYSTEM;
2747     }
2748
2749     Init();
2750
2751     m_encoding = encoding;
2752 }
2753
2754 wxCSConv::~wxCSConv()
2755 {
2756     Clear();
2757 }
2758
2759 wxCSConv::wxCSConv(const wxCSConv& conv)
2760         : wxMBConv()
2761 {
2762     Init();
2763
2764     SetName(conv.m_name);
2765     m_encoding = conv.m_encoding;
2766 }
2767
2768 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2769 {
2770     Clear();
2771
2772     SetName(conv.m_name);
2773     m_encoding = conv.m_encoding;
2774
2775     return *this;
2776 }
2777
2778 void wxCSConv::Clear()
2779 {
2780     free(m_name);
2781     delete m_convReal;
2782
2783     m_name = NULL;
2784     m_convReal = NULL;
2785 }
2786
2787 void wxCSConv::SetName(const wxChar *charset)
2788 {
2789     if (charset)
2790     {
2791         m_name = wxStrdup(charset);
2792         m_deferred = true;
2793     }
2794 }
2795
2796 #if wxUSE_FONTMAP
2797 #include "wx/hashmap.h"
2798
// Hash map type associating a string with a font encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when iconv is available: for each
// encoding already tried it contains the charset name for which an iconv
// conversion could be created, or an empty string if none of the names of
// this encoding worked.
static wxEncodingNameCache gs_nameCache;
2803 #endif
2804
2805 wxMBConv *wxCSConv::DoCreate() const
2806 {
2807 #if wxUSE_FONTMAP
2808     wxLogTrace(TRACE_STRCONV,
2809                wxT("creating conversion for %s"),
2810                (m_name ? m_name
2811                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2812 #endif // wxUSE_FONTMAP
2813
2814     // check for the special case of ASCII or ISO8859-1 charset: as we have
2815     // special knowledge of it anyhow, we don't need to create a special
2816     // conversion object
2817     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2818             m_encoding == wxFONTENCODING_DEFAULT )
2819     {
2820         // don't convert at all
2821         return NULL;
2822     }
2823
2824     // we trust OS to do conversion better than we can so try external
2825     // conversion methods first
2826     //
2827     // the full order is:
2828     //      1. OS conversion (iconv() under Unix or Win32 API)
2829     //      2. hard coded conversions for UTF
2830     //      3. wxEncodingConverter as fall back
2831
2832     // step (1)
2833 #ifdef HAVE_ICONV
2834 #if !wxUSE_FONTMAP
2835     if ( m_name )
2836 #endif // !wxUSE_FONTMAP
2837     {
2838         wxString name(m_name);
2839         wxFontEncoding encoding(m_encoding);
2840
2841         if ( !name.empty() )
2842         {
2843             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2844             if ( conv->IsOk() )
2845                 return conv;
2846
2847             delete conv;
2848
2849 #if wxUSE_FONTMAP
2850             encoding =
2851                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2852 #endif // wxUSE_FONTMAP
2853         }
2854 #if wxUSE_FONTMAP
2855         {
2856             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2857             if ( it != gs_nameCache.end() )
2858             {
2859                 if ( it->second.empty() )
2860                     return NULL;
2861
2862                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2863                 if ( conv->IsOk() )
2864                     return conv;
2865
2866                 delete conv;
2867             }
2868
2869             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2870
2871             for ( ; *names; ++names )
2872             {
2873                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2874                 if ( conv->IsOk() )
2875                 {
2876                     gs_nameCache[encoding] = *names;
2877                     return conv;
2878                 }
2879
2880                 delete conv;
2881             }
2882
2883             gs_nameCache[encoding] = _T(""); // cache the failure
2884         }
2885 #endif // wxUSE_FONTMAP
2886     }
2887 #endif // HAVE_ICONV
2888
2889 #ifdef wxHAVE_WIN32_MB2WC
2890     {
2891 #if wxUSE_FONTMAP
2892         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2893                                       : new wxMBConv_win32(m_encoding);
2894         if ( conv->IsOk() )
2895             return conv;
2896
2897         delete conv;
2898 #else
2899         return NULL;
2900 #endif
2901     }
2902 #endif // wxHAVE_WIN32_MB2WC
2903 #if defined(__WXMAC__)
2904     {
2905         // leave UTF16 and UTF32 to the built-ins of wx
2906         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2907             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2908         {
2909
2910 #if wxUSE_FONTMAP
2911             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2912                                         : new wxMBConv_mac(m_encoding);
2913 #else
2914             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2915 #endif
2916             if ( conv->IsOk() )
2917                  return conv;
2918
2919             delete conv;
2920         }
2921     }
2922 #endif
2923 #if defined(__WXCOCOA__)
2924     {
2925         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2926         {
2927
2928 #if wxUSE_FONTMAP
2929             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2930                                           : new wxMBConv_cocoa(m_encoding);
2931 #else
2932             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2933 #endif
2934             if ( conv->IsOk() )
2935                  return conv;
2936
2937             delete conv;
2938         }
2939     }
2940 #endif
2941     // step (2)
2942     wxFontEncoding enc = m_encoding;
2943 #if wxUSE_FONTMAP
2944     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2945     {
2946         // use "false" to suppress interactive dialogs -- we can be called from
2947         // anywhere and popping up a dialog from here is the last thing we want to
2948         // do
2949         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2950     }
2951 #endif // wxUSE_FONTMAP
2952
2953     switch ( enc )
2954     {
2955         case wxFONTENCODING_UTF7:
2956              return new wxMBConvUTF7;
2957
2958         case wxFONTENCODING_UTF8:
2959              return new wxMBConvUTF8;
2960
2961         case wxFONTENCODING_UTF16BE:
2962              return new wxMBConvUTF16BE;
2963
2964         case wxFONTENCODING_UTF16LE:
2965              return new wxMBConvUTF16LE;
2966
2967         case wxFONTENCODING_UTF32BE:
2968              return new wxMBConvUTF32BE;
2969
2970         case wxFONTENCODING_UTF32LE:
2971              return new wxMBConvUTF32LE;
2972
2973         default:
2974              // nothing to do but put here to suppress gcc warnings
2975              ;
2976     }
2977
2978     // step (3)
2979 #if wxUSE_FONTMAP
2980     {
2981         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2982                                       : new wxMBConv_wxwin(m_encoding);
2983         if ( conv->IsOk() )
2984             return conv;
2985
2986         delete conv;
2987     }
2988 #endif // wxUSE_FONTMAP
2989
2990     // NB: This is a hack to prevent deadlock. What could otherwise happen
2991     //     in Unicode build: wxConvLocal creation ends up being here
2992     //     because of some failure and logs the error. But wxLog will try to
2993     //     attach timestamp, for which it will need wxConvLocal (to convert
2994     //     time to char* and then wchar_t*), but that fails, tries to log
2995     //     error, but wxLog has a (already locked) critical section that
2996     //     guards static buffer.
2997     static bool alreadyLoggingError = false;
2998     if (!alreadyLoggingError)
2999     {
3000         alreadyLoggingError = true;
3001         wxLogError(_("Cannot convert from the charset '%s'!"),
3002                    m_name ? m_name
3003                       :
3004 #if wxUSE_FONTMAP
3005                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3006 #else // !wxUSE_FONTMAP
3007                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3008 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3009               );
3010         alreadyLoggingError = false;
3011     }
3012
3013     return NULL;
3014 }
3015
3016 void wxCSConv::CreateConvIfNeeded() const
3017 {
3018     if ( m_deferred )
3019     {
3020         wxCSConv *self = (wxCSConv *)this; // const_cast
3021
3022 #if wxUSE_INTL
3023         // if we don't have neither the name nor the encoding, use the default
3024         // encoding for this system
3025         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3026         {
3027             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3028         }
3029 #endif // wxUSE_INTL
3030
3031         self->m_convReal = DoCreate();
3032         self->m_deferred = false;
3033     }
3034 }
3035
3036 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3037 {
3038     CreateConvIfNeeded();
3039
3040     if (m_convReal)
3041         return m_convReal->MB2WC(buf, psz, n);
3042
3043     // latin-1 (direct)
3044     size_t len = strlen(psz);
3045
3046     if (buf)
3047     {
3048         for (size_t c = 0; c <= len; c++)
3049             buf[c] = (unsigned char)(psz[c]);
3050     }
3051
3052     return len;
3053 }
3054
3055 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3056 {
3057     CreateConvIfNeeded();
3058
3059     if (m_convReal)
3060         return m_convReal->WC2MB(buf, psz, n);
3061
3062     // latin-1 (direct)
3063     const size_t len = wxWcslen(psz);
3064     if (buf)
3065     {
3066         for (size_t c = 0; c <= len; c++)
3067         {
3068             if (psz[c] > 0xFF)
3069                 return (size_t)-1;
3070             buf[c] = (char)psz[c];
3071         }
3072     }
3073     else
3074     {
3075         for (size_t c = 0; c <= len; c++)
3076         {
3077             if (psz[c] > 0xFF)
3078                 return (size_t)-1;
3079         }
3080     }
3081
3082     return len;
3083 }
3084
3085 size_t wxCSConv::GetMBNulLen() const
3086 {
3087     CreateConvIfNeeded();
3088
3089     if ( m_convReal )
3090     {
3091         return m_convReal->GetMBNulLen();
3092     }
3093
3094     return 1;
3095 }
3096
3097 // ----------------------------------------------------------------------------
3098 // globals
3099 // ----------------------------------------------------------------------------
3100
3101 #ifdef __WINDOWS__
3102     static wxMBConv_win32 wxConvLibcObj;
3103 #elif defined(__WXMAC__) && !defined(__MACH__)
3104     static wxMBConv_mac wxConvLibcObj ;
3105 #else
3106     static wxMBConvLibc wxConvLibcObj;
3107 #endif
3108
3109 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3110 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3111 static wxMBConvUTF7 wxConvUTF7Obj;
3112 static wxMBConvUTF8 wxConvUTF8Obj;
3113
3114 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3115 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3116 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3117 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3118 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3119 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3120 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3121 #ifdef __WXOSX__
3122                                     wxConvUTF8Obj;
3123 #else
3124                                     wxConvLibcObj;
3125 #endif
3126
3127
3128 #else // !wxUSE_WCHAR_T
3129
3130 // stand-ins in absence of wchar_t
3131 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3132                                 wxConvISO8859_1,
3133                                 wxConvLocal,
3134                                 wxConvUTF8;
3135
3136 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T