src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 wxMBConv::~wxMBConv()
 151 {
 152     // nothing to do here (necessary for Darwin linking probably)
 153 }
 154
 155 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 156 {
 157     if ( psz )
 158     {
 159         // calculate the length of the buffer needed first
 160         size_t nLen = MB2WC(NULL, psz, 0);
 161         if ( nLen != (size_t)-1 )
 162         {
 163             // now do the actual conversion
 164             wxWCharBuffer buf(nLen);
 165             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 166             if ( nLen != (size_t)-1 )
 167             {
 168                 return buf;
 169             }
 170         }
 171     }
 172
 173     wxWCharBuffer buf((wchar_t *)NULL);
 174
 175     return buf;
 176 }
 177
 178 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 179 {
 180     if ( pwz )
 181     {
 182         size_t nLen = WC2MB(NULL, pwz, 0);
 183         if ( nLen != (size_t)-1 )
 184         {
 185             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 186             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 187             if ( nLen != (size_t)-1 )
 188             {
 189                 return buf;
 190             }
 191         }
 192     }
 193
 194     wxCharBuffer buf((char *)NULL);
 195
 196     return buf;
 197 }
 198
 199 const wxWCharBuffer
 200 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 201 {
 202     // the currently accumulated wide characters
 203     wxWCharBuffer wbuf;
 204
 205     // the current length of wbuf
 206     size_t lenBuf = 0;
 207
 208     // the number of NULs terminating this string
 209     size_t nulLen   wxDUMMY_INITIALIZE(0);
 210
 211     // make a copy of the input string unless it is already properly
 212     // NUL-terminated
 213     wxCharBuffer bufTmp;
 214
 215     // if we were not given the input size we just have to assume that the
 216     // string is properly terminated as we have no way of knowing how long it
 217     // is anyhow, but if we do have the size check whether there are enough
 218     // NULs at the end
 219     if ( inLen != (size_t)-1 )
 220     {
 221         // we need to know how to find the end of this string
 222         nulLen = GetMinMBCharWidth();
 223         if ( nulLen == (size_t)-1 )
 224             return wbuf;
 225
 226         // if there are enough NULs we can avoid the copy
 227         if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
 228         {
 229             // make a copy in order to properly NUL-terminate the string
 230             bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
 231             char * const p = bufTmp.data();
 232             memcpy(p, in, inLen);
 233             for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
 234                 *s = '\0';
 235         }
 236     }
 237
 238     if ( bufTmp )
 239         in = bufTmp;
 240
 241     size_t lenChunk;
 242     for ( const char * const inEnd = in + inLen;; )
 243     {
 244         // try to convert the current chunk
 245         lenChunk = MB2WC(NULL, in, 0);
 246         if ( lenChunk == 0 )
 247         {
 248             // nothing left in the input string, conversion succeeded
 249             break;
 250         }
 251
 252         if ( lenChunk == (size_t)-1 )
 253             break;
 254
 255         // if we already have a previous chunk, leave the NUL separating it
 256         // from this one
 257         if ( lenBuf )
 258             lenBuf++;
 259
 260         const size_t lenBufNew = lenBuf + lenChunk;
 261         if ( !wbuf.extend(lenBufNew) )
 262         {
 263             lenChunk = (size_t)-1;
 264             break;
 265         }
 266
 267         lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 268         if ( lenChunk == (size_t)-1 )
 269             break;
 270
 271         lenBuf = lenBufNew;
 272
 273         if ( inLen == (size_t)-1 )
 274         {
 275             // convert only one chunk in this case, as we suppose that the
 276             // string is NUL-terminated and so inEnd is not used at all
 277             break;
 278         }
 279
 280         // advance the input pointer past the end of this chunk
 281         while ( NotAllNULs(in, nulLen) )
 282         {
 283             // notice that we must skip over multiple bytes here as we suppose
 284             // that if NUL takes 2 or 4 bytes, then all the other characters do
 285             // too and so if advanced by a single byte we might erroneously
 286             // detect sequences of NUL bytes in the middle of the input
 287             in += nulLen;
 288         }
 289
 290         in += nulLen; // skipping over its terminator as well
 291
 292         // note that ">=" (and not just "==") is needed here as the terminator
 293         // we skipped just above could be inside or just after the buffer
 294         // delimited by inEnd
 295         if ( in >= inEnd )
 296             break;
 297     }
 298
 299     if ( lenChunk == (size_t)-1 )
 300     {
 301         // conversion failed
 302         lenBuf = 0;
 303         wbuf.reset();
 304     }
 305
 306     if ( outLen )
 307         *outLen = lenBuf;
 308
 309     return wbuf;
 310 }
 311
 312 const wxCharBuffer
 313 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 314 {
 315     // the currently accumulated multibyte characters
 316     wxCharBuffer buf;
 317
 318     // the current length of buf
 319     size_t lenBuf = 0;
 320
 321     // make a copy of the input string unless it is already properly
 322     // NUL-terminated
 323     //
 324     // if we don't know its length we have no choice but to assume that it is,
 325     // indeed, properly terminated
 326     wxWCharBuffer bufTmp;
 327     if ( inLen == (size_t)-1 )
 328     {
 329         inLen = wxWcslen(in) + 1;
 330     }
 331     else if ( inLen != 0 && in[inLen - 1] != L'\0' )
 332     {
 333         // make a copy in order to properly NUL-terminate the string
 334         bufTmp = wxWCharBuffer(inLen);
 335         memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
 336     }
 337
 338     if ( bufTmp )
 339         in = bufTmp;
 340
 341     for ( const wchar_t * const inEnd = in + inLen;; )
 342     {
 343         // try to convert the current chunk, if anything left
 344         size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
 345         if ( lenChunk == 0 )
 346         {
 347             // nothing left in the input string, conversion succeeded
 348             if ( outLen )
 349                 *outLen = lenBuf ? lenBuf - 1 : lenBuf;
 350
 351             return buf;
 352         }
 353
 354         if ( lenChunk == (size_t)-1 )
 355             break;
 356
 357         const size_t lenBufNew = lenBuf + lenChunk;
 358         if ( !buf.extend(lenBufNew) )
 359             break;
 360
 361         lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 362         if ( lenChunk == (size_t)-1 )
 363             break;
 364
 365         // chunk successfully converted, go to the next one
 366         in += wxWcslen(in) + 1 /* skip NUL too */;
 367         lenBuf = lenBufNew + 1;
 368     }
 369
 370     // conversion failed
 371     if ( outLen )
 372         *outLen = 0;
 373
 374     return wxCharBuffer();
 375 }
 376
 377 // ----------------------------------------------------------------------------
 378 // wxMBConvLibc
 379 // ----------------------------------------------------------------------------
 380
 381 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 382 {
 383     return wxMB2WC(buf, psz, n);
 384 }
 385
 386 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 387 {
 388     return wxWC2MB(buf, psz, n);
 389 }
 390
 391 // ----------------------------------------------------------------------------
 392 // wxConvBrokenFileNames
 393 // ----------------------------------------------------------------------------
 394
 395 #ifdef __UNIX__
 396
 397 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 398 {
 399     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 400                   || wxStricmp(charset, _T("UTF8")) == 0  )
 401         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 402     else
 403         m_conv = new wxCSConv(charset);
 404 }
 405
 406 #endif // __UNIX__
 407
 408 // ----------------------------------------------------------------------------
 409 // UTF-7
 410 // ----------------------------------------------------------------------------
 411
 412 // Implementation (C) 2004 Fredrik Roubert
 413
 414 //
 415 // BASE64 decoding table
 416 //
 417 static const unsigned char utf7unb64[] =
 418 {
 419     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 420     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 421     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 422     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 423     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 424     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 425     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 426     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 427     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 428     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 429     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 430     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 431     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 432     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 433     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 434     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 435     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 436     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 437     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 438     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 439     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 440     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 441     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 442     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 443     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 444     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 445     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 446     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 447     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 448     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 449     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 450     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 451 };
 452
 453 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 454 {
 455     size_t len = 0;
 456
 457     while ( *psz && (!buf || (len < n)) )
 458     {
 459         unsigned char cc = *psz++;
 460         if (cc != '+')
 461         {
 462             // plain ASCII char
 463             if (buf)
 464                 *buf++ = cc;
 465             len++;
 466         }
 467         else if (*psz == '-')
 468         {
 469             // encoded plus sign
 470             if (buf)
 471                 *buf++ = cc;
 472             len++;
 473             psz++;
 474         }
 475         else // start of BASE64 encoded string
 476         {
 477             bool lsb, ok;
 478             unsigned int d, l;
 479             for ( ok = lsb = false, d = 0, l = 0;
 480                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 481                   psz++ )
 482             {
 483                 d <<= 6;
 484                 d += cc;
 485                 for (l += 6; l >= 8; lsb = !lsb)
 486                 {
 487                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 488                     if (lsb)
 489                     {
 490                         if (buf)
 491                             *buf++ |= c;
 492                         len ++;
 493                     }
 494                     else
 495                     {
 496                         if (buf)
 497                             *buf = (wchar_t)(c << 8);
 498                     }
 499
 500                     ok = true;
 501                 }
 502             }
 503
 504             if ( !ok )
 505             {
 506                 // in valid UTF7 we should have valid characters after '+'
 507                 return (size_t)-1;
 508             }
 509
 510             if (*psz == '-')
 511                 psz++;
 512         }
 513     }
 514
 515     if ( buf && (len < n) )
 516         *buf = '\0';
 517
 518     return len;
 519 }
 520
 521 //
 522 // BASE64 encoding table
 523 //
 524 static const unsigned char utf7enb64[] =
 525 {
 526     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 527     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 528     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 529     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 530     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 531     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 532     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 533     '4', '5', '6', '7', '8', '9', '+', '/'
 534 };
 535
 536 //
 537 // UTF-7 encoding table
 538 //
 539 // 0 - Set D (directly encoded characters)
 540 // 1 - Set O (optional direct characters)
 541 // 2 - whitespace characters (optional)
 542 // 3 - special characters
 543 //
 544 static const unsigned char utf7encode[128] =
 545 {
 546     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 547     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 548     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 549     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 550     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 551     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 552     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 553     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 554 };
 555
 556 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 557 {
 558     size_t len = 0;
 559
 560     while (*psz && ((!buf) || (len < n)))
 561     {
 562         wchar_t cc = *psz++;
 563         if (cc < 0x80 && utf7encode[cc] < 1)
 564         {
 565             // plain ASCII char
 566             if (buf)
 567                 *buf++ = (char)cc;
 568             len++;
 569         }
 570 #ifndef WC_UTF16
 571         else if (((wxUint32)cc) > 0xffff)
 572         {
 573             // no surrogate pair generation (yet?)
 574             return (size_t)-1;
 575         }
 576 #endif
 577         else
 578         {
 579             if (buf)
 580                 *buf++ = '+';
 581             len++;
 582             if (cc != '+')
 583             {
 584                 // BASE64 encode string
 585                 unsigned int lsb, d, l;
 586                 for (d = 0, l = 0; /*nothing*/; psz++)
 587                 {
 588                     for (lsb = 0; lsb < 2; lsb ++)
 589                     {
 590                         d <<= 8;
 591                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 592
 593                         for (l += 8; l >= 6; )
 594                         {
 595                             l -= 6;
 596                             if (buf)
 597                                 *buf++ = utf7enb64[(d >> l) % 64];
 598                             len++;
 599                         }
 600                     }
 601                     cc = *psz;
 602                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 603                         break;
 604                 }
 605                 if (l != 0)
 606                 {
 607                     if (buf)
 608                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 609                     len++;
 610                 }
 611             }
 612             if (buf)
 613                 *buf++ = '-';
 614             len++;
 615         }
 616     }
 617     if (buf && (len < n))
 618         *buf = 0;
 619     return len;
 620 }
 621
 622 // ----------------------------------------------------------------------------
 623 // UTF-8
 624 // ----------------------------------------------------------------------------
 625
 626 static wxUint32 utf8_max[]=
 627     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 628
 629 // boundaries of the private use area we use to (temporarily) remap invalid
 630 // characters invalid in a UTF-8 encoded string
 631 const wxUint32 wxUnicodePUA = 0x100000;
 632 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 633
 634 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 635 {
 636     size_t len = 0;
 637
 638     while (*psz && ((!buf) || (len < n)))
 639     {
 640         const char *opsz = psz;
 641         bool invalid = false;
 642         unsigned char cc = *psz++, fc = cc;
 643         unsigned cnt;
 644         for (cnt = 0; fc & 0x80; cnt++)
 645             fc <<= 1;
 646         if (!cnt)
 647         {
 648             // plain ASCII char
 649             if (buf)
 650                 *buf++ = cc;
 651             len++;
 652
 653             // escape the escape character for octal escapes
 654             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 655                     && cc == '\\' && (!buf || len < n))
 656             {
 657                 if (buf)
 658                     *buf++ = cc;
 659                 len++;
 660             }
 661         }
 662         else
 663         {
 664             cnt--;
 665             if (!cnt)
 666             {
 667                 // invalid UTF-8 sequence
 668                 invalid = true;
 669             }
 670             else
 671             {
 672                 unsigned ocnt = cnt - 1;
 673                 wxUint32 res = cc & (0x3f >> cnt);
 674                 while (cnt--)
 675                 {
 676                     cc = *psz;
 677                     if ((cc & 0xC0) != 0x80)
 678                     {
 679                         // invalid UTF-8 sequence
 680                         invalid = true;
 681                         break;
 682                     }
 683                     psz++;
 684                     res = (res << 6) | (cc & 0x3f);
 685                 }
 686                 if (invalid || res <= utf8_max[ocnt])
 687                 {
 688                     // illegal UTF-8 encoding
 689                     invalid = true;
 690                 }
 691                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 692                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 693                 {
 694                     // if one of our PUA characters turns up externally
 695                     // it must also be treated as an illegal sequence
 696                     // (a bit like you have to escape an escape character)
 697                     invalid = true;
 698                 }
 699                 else
 700                 {
 701 #ifdef WC_UTF16
 702                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 703                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 704                     if (pa == (size_t)-1)
 705                     {
 706                         invalid = true;
 707                     }
 708                     else
 709                     {
 710                         if (buf)
 711                             buf += pa;
 712                         len += pa;
 713                     }
 714 #else // !WC_UTF16
 715                     if (buf)
 716                         *buf++ = (wchar_t)res;
 717                     len++;
 718 #endif // WC_UTF16/!WC_UTF16
 719                 }
 720             }
 721             if (invalid)
 722             {
 723                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 724                 {
 725                     while (opsz < psz && (!buf || len < n))
 726                     {
 727 #ifdef WC_UTF16
 728                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 729                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 730                         wxASSERT(pa != (size_t)-1);
 731                         if (buf)
 732                             buf += pa;
 733                         opsz++;
 734                         len += pa;
 735 #else
 736                         if (buf)
 737                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 738                         opsz++;
 739                         len++;
 740 #endif
 741                     }
 742                 }
 743                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 744                 {
 745                     while (opsz < psz && (!buf || len < n))
 746                     {
 747                         if ( buf && len + 3 < n )
 748                         {
 749                             unsigned char on = *opsz;
 750                             *buf++ = L'\\';
 751                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 752                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 753                             *buf++ = (wchar_t)( L'0' + on % 010 );
 754                         }
 755                         opsz++;
 756                         len += 4;
 757                     }
 758                 }
 759                 else // MAP_INVALID_UTF8_NOT
 760                 {
 761                     return (size_t)-1;
 762                 }
 763             }
 764         }
 765     }
 766     if (buf && (len < n))
 767         *buf = 0;
 768     return len;
 769 }
 770
 771 static inline bool isoctal(wchar_t wch)
 772 {
 773     return L'0' <= wch && wch <= L'7';
 774 }
 775
 776 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 777 {
 778     size_t len = 0;
 779
 780     while (*psz && ((!buf) || (len < n)))
 781     {
 782         wxUint32 cc;
 783 #ifdef WC_UTF16
 784         // cast is ok for WC_UTF16
 785         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 786         psz += (pa == (size_t)-1) ? 1 : pa;
 787 #else
 788         cc=(*psz++) & 0x7fffffff;
 789 #endif
 790
 791         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 792                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 793         {
 794             if (buf)
 795                 *buf++ = (char)(cc - wxUnicodePUA);
 796             len++;
 797         }
 798         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 799                     && cc == L'\\' && psz[0] == L'\\' )
 800         {
 801             if (buf)
 802                 *buf++ = (char)cc;
 803             psz++;
 804             len++;
 805         }
 806         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 807                     cc == L'\\' &&
 808                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 809         {
 810             if (buf)
 811             {
 812                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 813                                  (psz[1] - L'0')*010 +
 814                                  (psz[2] - L'0'));
 815             }
 816
 817             psz += 3;
 818             len++;
 819         }
 820         else
 821         {
 822             unsigned cnt;
 823             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 824             if (!cnt)
 825             {
 826                 // plain ASCII char
 827                 if (buf)
 828                     *buf++ = (char) cc;
 829                 len++;
 830             }
 831
 832             else
 833             {
 834                 len += cnt + 1;
 835                 if (buf)
 836                 {
 837                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 838                     while (cnt--)
 839                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 840                 }
 841             }
 842         }
 843     }
 844
 845     if (buf && (len<n))
 846         *buf = 0;
 847
 848     return len;
 849 }
 850
 851 // ----------------------------------------------------------------------------
 852 // UTF-16
 853 // ----------------------------------------------------------------------------
 854
 855 #ifdef WORDS_BIGENDIAN
 856     #define wxMBConvUTF16straight wxMBConvUTF16BE
 857     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 858 #else
 859     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 860     #define wxMBConvUTF16straight wxMBConvUTF16LE
 861 #endif
 862
 863
 864 #ifdef WC_UTF16
 865
 866 // copy 16bit MB to 16bit String
 867 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 868 {
 869     size_t len=0;
 870
 871     while (*(wxUint16*)psz && (!buf || len < n))
 872     {
 873         if (buf)
 874             *buf++ = *(wxUint16*)psz;
 875         len++;
 876
 877         psz += sizeof(wxUint16);
 878     }
 879     if (buf && len<n)   *buf=0;
 880
 881     return len;
 882 }
 883
 884
 885 // copy 16bit String to 16bit MB
 886 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 887 {
 888     size_t len=0;
 889
 890     while (*psz && (!buf || len < n))
 891     {
 892         if (buf)
 893         {
 894             *(wxUint16*)buf = *psz;
 895             buf += sizeof(wxUint16);
 896         }
 897         len += sizeof(wxUint16);
 898         psz++;
 899     }
 900     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 901
 902     return len;
 903 }
 904
 905
 906 // swap 16bit MB to 16bit String
 907 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 908 {
 909     size_t len = 0;
 910
 911     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 912     // inside the string
 913     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 914     {
 915         if ( buf )
 916         {
 917             ((char *)buf)[0] = psz[1];
 918             ((char *)buf)[1] = psz[0];
 919             buf++;
 920         }
 921         len++;
 922         psz += 2;
 923     }
 924
 925     if ( buf && len < n )
 926         *buf = L'\0';
 927
 928     return len;
 929 }
 930
 931
 932 // swap 16bit MB to 16bit String
 933 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 934 {
 935     size_t len = 0;
 936
 937     while ( *psz && (!buf || len < n) )
 938     {
 939         if ( buf )
 940         {
 941             *buf++ = ((char*)psz)[1];
 942             *buf++ = ((char*)psz)[0];
 943         }
 944         len += 2;
 945         psz++;
 946     }
 947
 948     if ( buf && len < n )
 949         *buf = '\0';
 950
 951     return len;
 952 }
 953
 954
 955 #else // WC_UTF16
 956
 957
 958 // copy 16bit MB to 32bit String
 959 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 960 {
 961     size_t len=0;
 962
 963     while (*(wxUint16*)psz && (!buf || len < n))
 964     {
 965         wxUint32 cc;
 966         size_t pa=decode_utf16((wxUint16*)psz, cc);
 967         if (pa == (size_t)-1)
 968             return pa;
 969
 970         if (buf)
 971             *buf++ = (wchar_t)cc;
 972         len++;
 973         psz += pa * sizeof(wxUint16);
 974     }
 975     if (buf && len<n)   *buf=0;
 976
 977     return len;
 978 }
 979
 980
 981 // copy 32bit String to 16bit MB
 982 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 983 {
 984     size_t len=0;
 985
 986     while (*psz && (!buf || len < n))
 987     {
 988         wxUint16 cc[2];
 989         size_t pa=encode_utf16(*psz, cc);
 990
 991         if (pa == (size_t)-1)
 992             return pa;
 993
 994         if (buf)
 995         {
 996             *(wxUint16*)buf = cc[0];
 997             buf += sizeof(wxUint16);
 998             if (pa > 1)
 999             {
1000                 *(wxUint16*)buf = cc[1];
1001                 buf += sizeof(wxUint16);
1002             }
1003         }
1004
1005         len += pa*sizeof(wxUint16);
1006         psz++;
1007     }
1008     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1009
1010     return len;
1011 }
1012
1013
1014 // swap 16bit MB to 32bit String
1015 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1016 {
1017     size_t len=0;
1018
1019     while (*(wxUint16*)psz && (!buf || len < n))
1020     {
1021         wxUint32 cc;
1022         char tmp[4];
1023         tmp[0]=psz[1];  tmp[1]=psz[0];
1024         tmp[2]=psz[3];  tmp[3]=psz[2];
1025
1026         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1027         if (pa == (size_t)-1)
1028             return pa;
1029
1030         if (buf)
1031             *buf++ = (wchar_t)cc;
1032
1033         len++;
1034         psz += pa * sizeof(wxUint16);
1035     }
1036     if (buf && len<n)   *buf=0;
1037
1038     return len;
1039 }
1040
1041
1042 // swap 32bit String to 16bit MB
1043 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1044 {
1045     size_t len=0;
1046
1047     while (*psz && (!buf || len < n))
1048     {
1049         wxUint16 cc[2];
1050         size_t pa=encode_utf16(*psz, cc);
1051
1052         if (pa == (size_t)-1)
1053             return pa;
1054
1055         if (buf)
1056         {
1057             *buf++ = ((char*)cc)[1];
1058             *buf++ = ((char*)cc)[0];
1059             if (pa > 1)
1060             {
1061                 *buf++ = ((char*)cc)[3];
1062                 *buf++ = ((char*)cc)[2];
1063             }
1064         }
1065
1066         len += pa*sizeof(wxUint16);
1067         psz++;
1068     }
1069     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1070
1071     return len;
1072 }
1073
1074 #endif // WC_UTF16
1075
1076
1077 // ----------------------------------------------------------------------------
1078 // UTF-32
1079 // ----------------------------------------------------------------------------
1080
// "straight" names the UTF-32 converter class implemented below by plain
// copying and "swap" the one implemented by reversing the bytes of each
// 32 bit unit; WORDS_BIGENDIAN selects which of the BE/LE classes plays which
// role
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1092
1093
1094 #ifdef WC_UTF16
1095
1096 // copy 32bit MB to 16bit String
1097 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1098 {
1099     size_t len=0;
1100
1101     while (*(wxUint32*)psz && (!buf || len < n))
1102     {
1103         wxUint16 cc[2];
1104
1105         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1106         if (pa == (size_t)-1)
1107             return pa;
1108
1109         if (buf)
1110         {
1111             *buf++ = cc[0];
1112             if (pa > 1)
1113                 *buf++ = cc[1];
1114         }
1115         len += pa;
1116         psz += sizeof(wxUint32);
1117     }
1118     if (buf && len<n)   *buf=0;
1119
1120     return len;
1121 }
1122
1123
1124 // copy 16bit String to 32bit MB
1125 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126 {
1127     size_t len=0;
1128
1129     while (*psz && (!buf || len < n))
1130     {
1131         wxUint32 cc;
1132
1133         // cast is ok for WC_UTF16
1134         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1135         if (pa == (size_t)-1)
1136             return pa;
1137
1138         if (buf)
1139         {
1140             *(wxUint32*)buf = cc;
1141             buf += sizeof(wxUint32);
1142         }
1143         len += sizeof(wxUint32);
1144         psz += pa;
1145     }
1146
1147     if (buf && len<=n-sizeof(wxUint32))
1148         *(wxUint32*)buf=0;
1149
1150     return len;
1151 }
1152
1153
1154
1155 // swap 32bit MB to 16bit String
1156 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1157 {
1158     size_t len=0;
1159
1160     while (*(wxUint32*)psz && (!buf || len < n))
1161     {
1162         char tmp[4];
1163         tmp[0] = psz[3];   tmp[1] = psz[2];
1164         tmp[2] = psz[1];   tmp[3] = psz[0];
1165
1166
1167         wxUint16 cc[2];
1168
1169         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1170         if (pa == (size_t)-1)
1171             return pa;
1172
1173         if (buf)
1174         {
1175             *buf++ = cc[0];
1176             if (pa > 1)
1177                 *buf++ = cc[1];
1178         }
1179         len += pa;
1180         psz += sizeof(wxUint32);
1181     }
1182
1183     if (buf && len<n)
1184         *buf=0;
1185
1186     return len;
1187 }
1188
1189
1190 // swap 16bit String to 32bit MB
1191 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1192 {
1193     size_t len=0;
1194
1195     while (*psz && (!buf || len < n))
1196     {
1197         char cc[4];
1198
1199         // cast is ok for WC_UTF16
1200         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1201         if (pa == (size_t)-1)
1202             return pa;
1203
1204         if (buf)
1205         {
1206             *buf++ = cc[3];
1207             *buf++ = cc[2];
1208             *buf++ = cc[1];
1209             *buf++ = cc[0];
1210         }
1211         len += sizeof(wxUint32);
1212         psz += pa;
1213     }
1214
1215     if (buf && len<=n-sizeof(wxUint32))
1216         *(wxUint32*)buf=0;
1217
1218     return len;
1219 }
1220
1221 #else // WC_UTF16
1222
1223
1224 // copy 32bit MB to 32bit String
1225 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1226 {
1227     size_t len=0;
1228
1229     while (*(wxUint32*)psz && (!buf || len < n))
1230     {
1231         if (buf)
1232             *buf++ = (wchar_t)(*(wxUint32*)psz);
1233         len++;
1234         psz += sizeof(wxUint32);
1235     }
1236
1237     if (buf && len<n)
1238         *buf=0;
1239
1240     return len;
1241 }
1242
1243
1244 // copy 32bit String to 32bit MB
1245 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1246 {
1247     size_t len=0;
1248
1249     while (*psz && (!buf || len < n))
1250     {
1251         if (buf)
1252         {
1253             *(wxUint32*)buf = *psz;
1254             buf += sizeof(wxUint32);
1255         }
1256
1257         len += sizeof(wxUint32);
1258         psz++;
1259     }
1260
1261     if (buf && len<=n-sizeof(wxUint32))
1262         *(wxUint32*)buf=0;
1263
1264     return len;
1265 }
1266
1267
1268 // swap 32bit MB to 32bit String
1269 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1270 {
1271     size_t len=0;
1272
1273     while (*(wxUint32*)psz && (!buf || len < n))
1274     {
1275         if (buf)
1276         {
1277             ((char *)buf)[0] = psz[3];
1278             ((char *)buf)[1] = psz[2];
1279             ((char *)buf)[2] = psz[1];
1280             ((char *)buf)[3] = psz[0];
1281             buf++;
1282         }
1283         len++;
1284         psz += sizeof(wxUint32);
1285     }
1286
1287     if (buf && len<n)
1288         *buf=0;
1289
1290     return len;
1291 }
1292
1293
1294 // swap 32bit String to 32bit MB
1295 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1296 {
1297     size_t len=0;
1298
1299     while (*psz && (!buf || len < n))
1300     {
1301         if (buf)
1302         {
1303             *buf++ = ((char *)psz)[3];
1304             *buf++ = ((char *)psz)[2];
1305             *buf++ = ((char *)psz)[1];
1306             *buf++ = ((char *)psz)[0];
1307         }
1308         len += sizeof(wxUint32);
1309         psz++;
1310     }
1311
1312     if (buf && len<=n-sizeof(wxUint32))
1313         *(wxUint32*)buf=0;
1314
1315     return len;
1316 }
1317
1318
1319 #endif // WC_UTF16
1320
1321
1322 // ============================================================================
1323 // The classes doing conversion using the iconv_xxx() functions
1324 // ============================================================================
1325
1326 #ifdef HAVE_ICONV
1327
1328 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1329 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1330 //     (unless there's yet another bug in glibc) the only case when iconv()
1331 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1332 //     left in the input buffer -- when _real_ error occurs,
1333 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1334 //     iconv() failure.
1335 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast applied to the address of the input pointer passed to iconv()
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value the iconv_t handles are compared with to detect that
// iconv_open() failed
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the byte order of one wchar_t and WC_ENC is the font
// encoding whose names are tried as iconv charsets for wchar_t, both chosen
// according to the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1357
1358 // ----------------------------------------------------------------------------
1359 // wxMBConv_iconv: encapsulates an iconv character set
1360 // ----------------------------------------------------------------------------
1361
1362 class wxMBConv_iconv : public wxMBConv
1363 {
1364 public:
1365     wxMBConv_iconv(const wxChar *name);
1366     virtual ~wxMBConv_iconv();
1367
1368     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1369     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1370
1371     bool IsOk() const
1372         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1373
1374 protected:
1375     // the iconv handlers used to translate from multibyte to wide char and in
1376     // the other direction
1377     iconv_t m2w,
1378             w2m;
1379 #if wxUSE_THREADS
1380     // guards access to m2w and w2m objects
1381     wxMutex m_iconvMutex;
1382 #endif
1383
1384 private:
1385     // classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
1386     // comment
1387     virtual size_t GetMinMBCharWidth() const;
1388
1389     // the name (for iconv_open()) of a wide char charset -- if none is
1390     // available on this machine, it will remain NULL
1391     static wxString ms_wcCharsetName;
1392
1393     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1394     // different endian-ness than the native one
1395     static bool ms_wcNeedsSwap;
1396
1397     // cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
1398     // initially
1399     size_t m_minMBCharWidth;
1400 };
1401
1402 // make the constructor available for unit testing
1403 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1404 {
1405     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1406     if ( !result->IsOk() )
1407     {
1408         delete result;
1409         return 0;
1410     }
1411     return result;
1412 }
1413
// name of the iconv charset used for wchar_t: empty until a wxMBConv_iconv
// constructor has found a working one
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor when the bytes of the wchar_t values exchanged with
// iconv have to be swapped
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1416
1417 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1418 {
1419     m_minMBCharWidth = 0;
1420
1421     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1422     // names for the charsets
1423     const wxCharBuffer cname(wxString(name).ToAscii());
1424
1425     // check for charset that represents wchar_t:
1426     if ( ms_wcCharsetName.empty() )
1427     {
1428         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1429
1430 #if wxUSE_FONTMAP
1431         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1432 #else // !wxUSE_FONTMAP
1433         static const wxChar *names[] =
1434         {
1435 #if SIZEOF_WCHAR_T == 4
1436             _T("UCS-4"),
1437 #elif SIZEOF_WCHAR_T = 2
1438             _T("UCS-2"),
1439 #endif
1440             NULL
1441         };
1442 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1443
1444         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1445         {
1446             const wxString nameCS(*names);
1447
1448             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1449             wxString nameXE(nameCS);
1450             #ifdef WORDS_BIGENDIAN
1451                 nameXE += _T("BE");
1452             #else // little endian
1453                 nameXE += _T("LE");
1454             #endif
1455
1456             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1457                        nameXE.c_str());
1458
1459             m2w = iconv_open(nameXE.ToAscii(), cname);
1460             if ( m2w == ICONV_T_INVALID )
1461             {
1462                 // try charset w/o bytesex info (e.g. "UCS4")
1463                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1464                            nameCS.c_str());
1465                 m2w = iconv_open(nameCS.ToAscii(), cname);
1466
1467                 // and check for bytesex ourselves:
1468                 if ( m2w != ICONV_T_INVALID )
1469                 {
1470                     char    buf[2], *bufPtr;
1471                     wchar_t wbuf[2], *wbufPtr;
1472                     size_t  insz, outsz;
1473                     size_t  res;
1474
1475                     buf[0] = 'A';
1476                     buf[1] = 0;
1477                     wbuf[0] = 0;
1478                     insz = 2;
1479                     outsz = SIZEOF_WCHAR_T * 2;
1480                     wbufPtr = wbuf;
1481                     bufPtr = buf;
1482
1483                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1484                                 (char**)&wbufPtr, &outsz);
1485
1486                     if (ICONV_FAILED(res, insz))
1487                     {
1488                         wxLogLastError(wxT("iconv"));
1489                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1490                                    nameCS.c_str());
1491                     }
1492                     else // ok, can convert to this encoding, remember it
1493                     {
1494                         ms_wcCharsetName = nameCS;
1495                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1496                     }
1497                 }
1498             }
1499             else // use charset not requiring byte swapping
1500             {
1501                 ms_wcCharsetName = nameXE;
1502             }
1503         }
1504
1505         wxLogTrace(TRACE_STRCONV,
1506                    wxT("iconv wchar_t charset is \"%s\"%s"),
1507                    ms_wcCharsetName.empty() ? _T("<none>")
1508                                             : ms_wcCharsetName.c_str(),
1509                    ms_wcNeedsSwap ? _T(" (needs swap)")
1510                                   : _T(""));
1511     }
1512     else // we already have ms_wcCharsetName
1513     {
1514         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1515     }
1516
1517     if ( ms_wcCharsetName.empty() )
1518     {
1519         w2m = ICONV_T_INVALID;
1520     }
1521     else
1522     {
1523         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1524         if ( w2m == ICONV_T_INVALID )
1525         {
1526             wxLogTrace(TRACE_STRCONV,
1527                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1528                        ms_wcCharsetName.c_str(), cname.data());
1529         }
1530     }
1531 }
1532
1533 wxMBConv_iconv::~wxMBConv_iconv()
1534 {
1535     if ( m2w != ICONV_T_INVALID )
1536         iconv_close(m2w);
1537     if ( w2m != ICONV_T_INVALID )
1538         iconv_close(w2m);
1539 }
1540
1541 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1542 {
1543     // find the string length: notice that must be done differently for
1544     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1545     size_t inbuf;
1546     const size_t nulLen = GetMinMBCharWidth();
1547     switch ( nulLen )
1548     {
1549         default:
1550             return (size_t)-1;
1551
1552         case 1:
1553             inbuf = strlen(psz); // arguably more optimized than our version
1554             break;
1555
1556         case 2:
1557         case 4:
1558             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1559             // they also have to start at character boundary and not span two
1560             // adjacent characters
1561             const char *p;
1562             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1563                 ;
1564             inbuf = p - psz;
1565             break;
1566     }
1567
1568 #if wxUSE_THREADS
1569     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1570     //     Unfortunately there is a couple of global wxCSConv objects such as
1571     //     wxConvLocal that are used all over wx code, so we have to make sure
1572     //     the handle is used by at most one thread at the time. Otherwise
1573     //     only a few wx classes would be safe to use from non-main threads
1574     //     as MB<->WC conversion would fail "randomly".
1575     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1576 #endif // wxUSE_THREADS
1577
1578
1579     size_t outbuf = n * SIZEOF_WCHAR_T;
1580     size_t res, cres;
1581     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1582     wchar_t *bufPtr = buf;
1583     const char *pszPtr = psz;
1584
1585     if (buf)
1586     {
1587         // have destination buffer, convert there
1588         cres = iconv(m2w,
1589                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1590                      (char**)&bufPtr, &outbuf);
1591         res = n - (outbuf / SIZEOF_WCHAR_T);
1592
1593         if (ms_wcNeedsSwap)
1594         {
1595             // convert to native endianness
1596             for ( unsigned i = 0; i < res; i++ )
1597                 buf[n] = WC_BSWAP(buf[i]);
1598         }
1599
1600         // NUL-terminate the string if there is any space left
1601         if (res < n)
1602             buf[res] = 0;
1603     }
1604     else
1605     {
1606         // no destination buffer... convert using temp buffer
1607         // to calculate destination buffer requirement
1608         wchar_t tbuf[8];
1609         res = 0;
1610         do {
1611             bufPtr = tbuf;
1612             outbuf = 8*SIZEOF_WCHAR_T;
1613
1614             cres = iconv(m2w,
1615                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1616                          (char**)&bufPtr, &outbuf );
1617
1618             res += 8-(outbuf/SIZEOF_WCHAR_T);
1619         } while ((cres==(size_t)-1) && (errno==E2BIG));
1620     }
1621
1622     if (ICONV_FAILED(cres, inbuf))
1623     {
1624         //VS: it is ok if iconv fails, hence trace only
1625         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1626         return (size_t)-1;
1627     }
1628
1629     return res;
1630 }
1631
1632 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1633 {
1634 #if wxUSE_THREADS
1635     // NB: explained in MB2WC
1636     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1637 #endif
1638
1639     size_t inlen = wxWcslen(psz);
1640     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1641     size_t outbuf = n;
1642     size_t res, cres;
1643
1644     wchar_t *tmpbuf = 0;
1645
1646     if (ms_wcNeedsSwap)
1647     {
1648         // need to copy to temp buffer to switch endianness
1649         // (doing WC_BSWAP twice on the original buffer won't help, as it
1650         //  could be in read-only memory, or be accessed in some other thread)
1651         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1652         for ( size_t i = 0; i < inlen; i++ )
1653             tmpbuf[n] = WC_BSWAP(psz[i]);
1654         tmpbuf[inlen] = L'\0';
1655         psz = tmpbuf;
1656     }
1657
1658     if (buf)
1659     {
1660         // have destination buffer, convert there
1661         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1662
1663         res = n-outbuf;
1664
1665         // NB: iconv was given only wcslen(psz) characters on input, and so
1666         //     it couldn't convert the trailing zero. Let's do it ourselves
1667         //     if there's some room left for it in the output buffer.
1668         if (res < n)
1669             buf[0] = 0;
1670     }
1671     else
1672     {
1673         // no destination buffer... convert using temp buffer
1674         // to calculate destination buffer requirement
1675         char tbuf[16];
1676         res = 0;
1677         do {
1678             buf = tbuf; outbuf = 16;
1679
1680             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1681
1682             res += 16 - outbuf;
1683         } while ((cres==(size_t)-1) && (errno==E2BIG));
1684     }
1685
1686     if (ms_wcNeedsSwap)
1687     {
1688         free(tmpbuf);
1689     }
1690
1691     if (ICONV_FAILED(cres, inbuf))
1692     {
1693         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1694         return (size_t)-1;
1695     }
1696
1697     return res;
1698 }
1699
1700 size_t wxMBConv_iconv::GetMinMBCharWidth() const
1701 {
1702     if ( m_minMBCharWidth == 0 )
1703     {
1704         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1705
1706 #if wxUSE_THREADS
1707         // NB: explained in MB2WC
1708         wxMutexLocker lock(self->m_iconvMutex);
1709 #endif
1710
1711         wchar_t *wnul = L"";
1712         char buf[8]; // should be enough for NUL in any encoding
1713         size_t inLen = sizeof(wchar_t),
1714                outLen = WXSIZEOF(buf);
1715         char *in = (char *)wnul;
1716         char *out = buf;
1717         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1718         {
1719             self->m_minMBCharWidth = (size_t)-1;
1720         }
1721         else // ok
1722         {
1723             self->m_minMBCharWidth = out - buf;
1724         }
1725     }
1726
1727     return m_minMBCharWidth;
1728 }
1729
1730 #endif // HAVE_ICONV
1731
1732
1733 // ============================================================================
1734 // Win32 conversion classes
1735 // ============================================================================
1736
1737 #ifdef wxHAVE_WIN32_MB2WC
1738
1739 // from utils.cpp
1740 #if wxUSE_FONTMAP
1741 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1742 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1743 #endif
1744
1745 class wxMBConv_win32 : public wxMBConv
1746 {
1747 public:
1748     wxMBConv_win32()
1749     {
1750         m_CodePage = CP_ACP;
1751         m_minMBCharWidth = 0;
1752     }
1753
1754 #if wxUSE_FONTMAP
1755     wxMBConv_win32(const wxChar* name)
1756     {
1757         m_CodePage = wxCharsetToCodepage(name);
1758         m_minMBCharWidth = 0;
1759     }
1760
1761     wxMBConv_win32(wxFontEncoding encoding)
1762     {
1763         m_CodePage = wxEncodingToCodepage(encoding);
1764         m_minMBCharWidth = 0;
1765     }
1766 #endif // wxUSE_FONTMAP
1767
1768     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1769     {
1770         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1771         // the behaviour is not compatible with the Unix version (using iconv)
1772         // and break the library itself, e.g. wxTextInputStream::NextChar()
1773         // wouldn't work if reading an incomplete MB char didn't result in an
1774         // error
1775         //
1776         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1777         // an error (tested under Windows Server 2003) and apparently it is
1778         // done on purpose, i.e. the function accepts any input in this case
1779         // and although I'd prefer to return error on ill-formed output, our
1780         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1781         // explicitly ill-formed according to RFC 2152) neither so we don't
1782         // even have any fallback here...
1783         //
1784         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1785         // Win XP or newer and if it is specified on older versions, conversion
1786         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1787         // fails. So we can only use the flag on newer Windows versions.
1788         // Additionally, the flag is not supported by UTF7, symbol and CJK
1789         // encodings. See here:
1790         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1791         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1792         int flags = 0;
1793         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1794              m_CodePage < 50000 &&
1795              IsAtLeastWin2kSP4() )
1796         {
1797             flags = MB_ERR_INVALID_CHARS;
1798         }
1799         else if ( m_CodePage == CP_UTF8 )
1800         {
1801             // Avoid round-trip in the special case of UTF-8 by using our
1802             // own UTF-8 conversion code:
1803             return wxMBConvUTF8().MB2WC(buf, psz, n);
1804         }
1805
1806         const size_t len = ::MultiByteToWideChar
1807                              (
1808                                 m_CodePage,     // code page
1809                                 flags,          // flags: fall on error
1810                                 psz,            // input string
1811                                 -1,             // its length (NUL-terminated)
1812                                 buf,            // output string
1813                                 buf ? n : 0     // size of output buffer
1814                              );
1815         if ( !len )
1816         {
1817             // function totally failed
1818             return (size_t)-1;
1819         }
1820
1821         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1822         // check if we succeeded, by doing a double trip:
1823         if ( !flags && buf )
1824         {
1825             const size_t mbLen = strlen(psz);
1826             wxCharBuffer mbBuf(mbLen);
1827             if ( ::WideCharToMultiByte
1828                    (
1829                       m_CodePage,
1830                       0,
1831                       buf,
1832                       -1,
1833                       mbBuf.data(),
1834                       mbLen + 1,        // size in bytes, not length
1835                       NULL,
1836                       NULL
1837                    ) == 0 ||
1838                   strcmp(mbBuf, psz) != 0 )
1839             {
1840                 // we didn't obtain the same thing we started from, hence
1841                 // the conversion was lossy and we consider that it failed
1842                 return (size_t)-1;
1843             }
1844         }
1845
1846         // note that it returns count of written chars for buf != NULL and size
1847         // of the needed buffer for buf == NULL so in either case the length of
1848         // the string (which never includes the terminating NUL) is one less
1849         return len - 1;
1850     }
1851
1852     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1853     {
1854         /*
1855             we have a problem here: by default, WideCharToMultiByte() may
1856             replace characters unrepresentable in the target code page with bad
1857             quality approximations such as turning "1/2" symbol (U+00BD) into
1858             "1" for the code pages which don't have it and we, obviously, want
1859             to avoid this at any price
1860
1861             the trouble is that this function does it _silently_, i.e. it won't
1862             even tell us whether it did or not... Win98/2000 and higher provide
1863             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1864             we have to resort to a round trip, i.e. check that converting back
1865             results in the same string -- this is, of course, expensive but
1866             otherwise we simply can't be sure to not garble the data.
1867          */
1868
1869         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1870         // it doesn't work with CJK encodings (which we test for rather roughly
1871         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1872         // supporting it
1873         BOOL usedDef wxDUMMY_INITIALIZE(false);
1874         BOOL *pUsedDef;
1875         int flags;
1876         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1877         {
1878             // it's our lucky day
1879             flags = WC_NO_BEST_FIT_CHARS;
1880             pUsedDef = &usedDef;
1881         }
1882         else // old system or unsupported encoding
1883         {
1884             flags = 0;
1885             pUsedDef = NULL;
1886         }
1887
1888         const size_t len = ::WideCharToMultiByte
1889                              (
1890                                 m_CodePage,     // code page
1891                                 flags,          // either none or no best fit
1892                                 pwz,            // input string
1893                                 -1,             // it is (wide) NUL-terminated
1894                                 buf,            // output buffer
1895                                 buf ? n : 0,    // and its size
1896                                 NULL,           // default "replacement" char
1897                                 pUsedDef        // [out] was it used?
1898                              );
1899
1900         if ( !len )
1901         {
1902             // function totally failed
1903             return (size_t)-1;
1904         }
1905
1906         // if we were really converting, check if we succeeded
1907         if ( buf )
1908         {
1909             if ( flags )
1910             {
1911                 // check if the conversion failed, i.e. if any replacements
1912                 // were done
1913                 if ( usedDef )
1914                     return (size_t)-1;
1915             }
1916             else // we must resort to double tripping...
1917             {
1918                 wxWCharBuffer wcBuf(n);
1919                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1920                         wcscmp(wcBuf, pwz) != 0 )
1921                 {
1922                     // we didn't obtain the same thing we started from, hence
1923                     // the conversion was lossy and we consider that it failed
1924                     return (size_t)-1;
1925                 }
1926             }
1927         }
1928
1929         // see the comment above for the reason of "len - 1"
1930         return len - 1;
1931     }
1932
1933     bool IsOk() const { return m_CodePage != -1; }
1934
1935 private:
1936     static bool CanUseNoBestFit()
1937     {
1938         static int s_isWin98Or2k = -1;
1939
1940         if ( s_isWin98Or2k == -1 )
1941         {
1942             int verMaj, verMin;
1943             switch ( wxGetOsVersion(&verMaj, &verMin) )
1944             {
1945                 case wxWIN95:
1946                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1947                     break;
1948
1949                 case wxWINDOWS_NT:
1950                     s_isWin98Or2k = verMaj >= 5;
1951                     break;
1952
1953                 default:
1954                     // unknown, be conseravtive by default
1955                     s_isWin98Or2k = 0;
1956             }
1957
1958             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1959         }
1960
1961         return s_isWin98Or2k == 1;
1962     }
1963
1964     static bool IsAtLeastWin2kSP4()
1965     {
1966 #ifdef __WXWINCE__
1967         return false;
1968 #else
1969         static int s_isAtLeastWin2kSP4 = -1;
1970
1971         if ( s_isAtLeastWin2kSP4 == -1 )
1972         {
1973             OSVERSIONINFOEX ver;
1974
1975             memset(&ver, 0, sizeof(ver));
1976             ver.dwOSVersionInfoSize = sizeof(ver);
1977             GetVersionEx((OSVERSIONINFO*)&ver);
1978
1979             s_isAtLeastWin2kSP4 =
1980               ((ver.dwMajorVersion > 5) || // Vista+
1981                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1982                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1983                ver.wServicePackMajor >= 4)) // 2000 SP4+
1984               ? 1 : 0;
1985         }
1986
1987         return s_isAtLeastWin2kSP4 == 1;
1988 #endif
1989     }
1990
1991     virtual size_t GetMinMBCharWidth() const
1992     {
1993         if ( m_minMBCharWidth == 0 )
1994         {
1995             int len = ::WideCharToMultiByte
1996                         (
1997                             m_CodePage,     // code page
1998                             0,              // no flags
1999                             L"",            // input string
2000                             1,              // translate just the NUL
2001                             NULL,           // output buffer
2002                             0,              // and its size
2003                             NULL,           // no replacement char
2004                             NULL            // [out] don't care if it was used
2005                         );
2006
2007             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2008             switch ( len )
2009             {
2010                 default:
2011                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2012                     // fall through
2013
2014                 case 0:
2015                     self->m_minMBCharWidth = (size_t)-1;
2016                     break;
2017
2018                 case 1:
2019                 case 2:
2020                 case 4:
2021                     self->m_minMBCharWidth = len;
2022                     break;
2023             }
2024         }
2025
2026         return m_minMBCharWidth;
2027     }
2028
2029     // the code page we're working with
2030     long m_CodePage;
2031
2032     // cached result of GetMinMBCharWidth(), set to 0 initially meaning
2033     // "unknown"
2034     size_t m_minMBCharWidth;
2035 };
2036
2037 #endif // wxHAVE_WIN32_MB2WC
2038
2039 // ============================================================================
2040 // Cocoa conversion classes
2041 // ============================================================================
2042
2043 #if defined(__WXCOCOA__)
2044
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2048
2049 #include <CoreFoundation/CFString.h>
2050 #include <CoreFoundation/CFStringEncodingExt.h>
2051
2052 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2053 {
2054     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2055     if ( encoding == wxFONTENCODING_DEFAULT )
2056     {
2057         enc = CFStringGetSystemEncoding();
2058     }
2059     else switch( encoding)
2060     {
2061         case wxFONTENCODING_ISO8859_1 :
2062             enc = kCFStringEncodingISOLatin1 ;
2063             break ;
2064         case wxFONTENCODING_ISO8859_2 :
2065             enc = kCFStringEncodingISOLatin2;
2066             break ;
2067         case wxFONTENCODING_ISO8859_3 :
2068             enc = kCFStringEncodingISOLatin3 ;
2069             break ;
2070         case wxFONTENCODING_ISO8859_4 :
2071             enc = kCFStringEncodingISOLatin4;
2072             break ;
2073         case wxFONTENCODING_ISO8859_5 :
2074             enc = kCFStringEncodingISOLatinCyrillic;
2075             break ;
2076         case wxFONTENCODING_ISO8859_6 :
2077             enc = kCFStringEncodingISOLatinArabic;
2078             break ;
2079         case wxFONTENCODING_ISO8859_7 :
2080             enc = kCFStringEncodingISOLatinGreek;
2081             break ;
2082         case wxFONTENCODING_ISO8859_8 :
2083             enc = kCFStringEncodingISOLatinHebrew;
2084             break ;
2085         case wxFONTENCODING_ISO8859_9 :
2086             enc = kCFStringEncodingISOLatin5;
2087             break ;
2088         case wxFONTENCODING_ISO8859_10 :
2089             enc = kCFStringEncodingISOLatin6;
2090             break ;
2091         case wxFONTENCODING_ISO8859_11 :
2092             enc = kCFStringEncodingISOLatinThai;
2093             break ;
2094         case wxFONTENCODING_ISO8859_13 :
2095             enc = kCFStringEncodingISOLatin7;
2096             break ;
2097         case wxFONTENCODING_ISO8859_14 :
2098             enc = kCFStringEncodingISOLatin8;
2099             break ;
2100         case wxFONTENCODING_ISO8859_15 :
2101             enc = kCFStringEncodingISOLatin9;
2102             break ;
2103
2104         case wxFONTENCODING_KOI8 :
2105             enc = kCFStringEncodingKOI8_R;
2106             break ;
2107         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2108             enc = kCFStringEncodingDOSRussian;
2109             break ;
2110
2111 //      case wxFONTENCODING_BULGARIAN :
2112 //          enc = ;
2113 //          break ;
2114
2115         case wxFONTENCODING_CP437 :
2116             enc =kCFStringEncodingDOSLatinUS ;
2117             break ;
2118         case wxFONTENCODING_CP850 :
2119             enc = kCFStringEncodingDOSLatin1;
2120             break ;
2121         case wxFONTENCODING_CP852 :
2122             enc = kCFStringEncodingDOSLatin2;
2123             break ;
2124         case wxFONTENCODING_CP855 :
2125             enc = kCFStringEncodingDOSCyrillic;
2126             break ;
2127         case wxFONTENCODING_CP866 :
2128             enc =kCFStringEncodingDOSRussian ;
2129             break ;
2130         case wxFONTENCODING_CP874 :
2131             enc = kCFStringEncodingDOSThai;
2132             break ;
2133         case wxFONTENCODING_CP932 :
2134             enc = kCFStringEncodingDOSJapanese;
2135             break ;
2136         case wxFONTENCODING_CP936 :
2137             enc =kCFStringEncodingDOSChineseSimplif ;
2138             break ;
2139         case wxFONTENCODING_CP949 :
2140             enc = kCFStringEncodingDOSKorean;
2141             break ;
2142         case wxFONTENCODING_CP950 :
2143             enc = kCFStringEncodingDOSChineseTrad;
2144             break ;
2145         case wxFONTENCODING_CP1250 :
2146             enc = kCFStringEncodingWindowsLatin2;
2147             break ;
2148         case wxFONTENCODING_CP1251 :
2149             enc =kCFStringEncodingWindowsCyrillic ;
2150             break ;
2151         case wxFONTENCODING_CP1252 :
2152             enc =kCFStringEncodingWindowsLatin1 ;
2153             break ;
2154         case wxFONTENCODING_CP1253 :
2155             enc = kCFStringEncodingWindowsGreek;
2156             break ;
2157         case wxFONTENCODING_CP1254 :
2158             enc = kCFStringEncodingWindowsLatin5;
2159             break ;
2160         case wxFONTENCODING_CP1255 :
2161             enc =kCFStringEncodingWindowsHebrew ;
2162             break ;
2163         case wxFONTENCODING_CP1256 :
2164             enc =kCFStringEncodingWindowsArabic ;
2165             break ;
2166         case wxFONTENCODING_CP1257 :
2167             enc = kCFStringEncodingWindowsBalticRim;
2168             break ;
2169 //   This only really encodes to UTF7 (if that) evidently
2170 //        case wxFONTENCODING_UTF7 :
2171 //            enc = kCFStringEncodingNonLossyASCII ;
2172 //            break ;
2173         case wxFONTENCODING_UTF8 :
2174             enc = kCFStringEncodingUTF8 ;
2175             break ;
2176         case wxFONTENCODING_EUC_JP :
2177             enc = kCFStringEncodingEUC_JP;
2178             break ;
2179         case wxFONTENCODING_UTF16 :
2180             enc = kCFStringEncodingUnicode ;
2181             break ;
2182         case wxFONTENCODING_MACROMAN :
2183             enc = kCFStringEncodingMacRoman ;
2184             break ;
2185         case wxFONTENCODING_MACJAPANESE :
2186             enc = kCFStringEncodingMacJapanese ;
2187             break ;
2188         case wxFONTENCODING_MACCHINESETRAD :
2189             enc = kCFStringEncodingMacChineseTrad ;
2190             break ;
2191         case wxFONTENCODING_MACKOREAN :
2192             enc = kCFStringEncodingMacKorean ;
2193             break ;
2194         case wxFONTENCODING_MACARABIC :
2195             enc = kCFStringEncodingMacArabic ;
2196             break ;
2197         case wxFONTENCODING_MACHEBREW :
2198             enc = kCFStringEncodingMacHebrew ;
2199             break ;
2200         case wxFONTENCODING_MACGREEK :
2201             enc = kCFStringEncodingMacGreek ;
2202             break ;
2203         case wxFONTENCODING_MACCYRILLIC :
2204             enc = kCFStringEncodingMacCyrillic ;
2205             break ;
2206         case wxFONTENCODING_MACDEVANAGARI :
2207             enc = kCFStringEncodingMacDevanagari ;
2208             break ;
2209         case wxFONTENCODING_MACGURMUKHI :
2210             enc = kCFStringEncodingMacGurmukhi ;
2211             break ;
2212         case wxFONTENCODING_MACGUJARATI :
2213             enc = kCFStringEncodingMacGujarati ;
2214             break ;
2215         case wxFONTENCODING_MACORIYA :
2216             enc = kCFStringEncodingMacOriya ;
2217             break ;
2218         case wxFONTENCODING_MACBENGALI :
2219             enc = kCFStringEncodingMacBengali ;
2220             break ;
2221         case wxFONTENCODING_MACTAMIL :
2222             enc = kCFStringEncodingMacTamil ;
2223             break ;
2224         case wxFONTENCODING_MACTELUGU :
2225             enc = kCFStringEncodingMacTelugu ;
2226             break ;
2227         case wxFONTENCODING_MACKANNADA :
2228             enc = kCFStringEncodingMacKannada ;
2229             break ;
2230         case wxFONTENCODING_MACMALAJALAM :
2231             enc = kCFStringEncodingMacMalayalam ;
2232             break ;
2233         case wxFONTENCODING_MACSINHALESE :
2234             enc = kCFStringEncodingMacSinhalese ;
2235             break ;
2236         case wxFONTENCODING_MACBURMESE :
2237             enc = kCFStringEncodingMacBurmese ;
2238             break ;
2239         case wxFONTENCODING_MACKHMER :
2240             enc = kCFStringEncodingMacKhmer ;
2241             break ;
2242         case wxFONTENCODING_MACTHAI :
2243             enc = kCFStringEncodingMacThai ;
2244             break ;
2245         case wxFONTENCODING_MACLAOTIAN :
2246             enc = kCFStringEncodingMacLaotian ;
2247             break ;
2248         case wxFONTENCODING_MACGEORGIAN :
2249             enc = kCFStringEncodingMacGeorgian ;
2250             break ;
2251         case wxFONTENCODING_MACARMENIAN :
2252             enc = kCFStringEncodingMacArmenian ;
2253             break ;
2254         case wxFONTENCODING_MACCHINESESIMP :
2255             enc = kCFStringEncodingMacChineseSimp ;
2256             break ;
2257         case wxFONTENCODING_MACTIBETAN :
2258             enc = kCFStringEncodingMacTibetan ;
2259             break ;
2260         case wxFONTENCODING_MACMONGOLIAN :
2261             enc = kCFStringEncodingMacMongolian ;
2262             break ;
2263         case wxFONTENCODING_MACETHIOPIC :
2264             enc = kCFStringEncodingMacEthiopic ;
2265             break ;
2266         case wxFONTENCODING_MACCENTRALEUR :
2267             enc = kCFStringEncodingMacCentralEurRoman ;
2268             break ;
2269         case wxFONTENCODING_MACVIATNAMESE :
2270             enc = kCFStringEncodingMacVietnamese ;
2271             break ;
2272         case wxFONTENCODING_MACARABICEXT :
2273             enc = kCFStringEncodingMacExtArabic ;
2274             break ;
2275         case wxFONTENCODING_MACSYMBOL :
2276             enc = kCFStringEncodingMacSymbol ;
2277             break ;
2278         case wxFONTENCODING_MACDINGBATS :
2279             enc = kCFStringEncodingMacDingbats ;
2280             break ;
2281         case wxFONTENCODING_MACTURKISH :
2282             enc = kCFStringEncodingMacTurkish ;
2283             break ;
2284         case wxFONTENCODING_MACCROATIAN :
2285             enc = kCFStringEncodingMacCroatian ;
2286             break ;
2287         case wxFONTENCODING_MACICELANDIC :
2288             enc = kCFStringEncodingMacIcelandic ;
2289             break ;
2290         case wxFONTENCODING_MACROMANIAN :
2291             enc = kCFStringEncodingMacRomanian ;
2292             break ;
2293         case wxFONTENCODING_MACCELTIC :
2294             enc = kCFStringEncodingMacCeltic ;
2295             break ;
2296         case wxFONTENCODING_MACGAELIC :
2297             enc = kCFStringEncodingMacGaelic ;
2298             break ;
2299 //      case wxFONTENCODING_MACKEYBOARD :
2300 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2301 //          break ;
2302         default :
2303             // because gcc is picky
2304             break ;
2305     } ;
2306     return enc ;
2307 }
2308
2309 class wxMBConv_cocoa : public wxMBConv
2310 {
2311 public:
2312     wxMBConv_cocoa()
2313     {
2314         Init(CFStringGetSystemEncoding()) ;
2315     }
2316
2317 #if wxUSE_FONTMAP
2318     wxMBConv_cocoa(const wxChar* name)
2319     {
2320         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2321     }
2322 #endif
2323
2324     wxMBConv_cocoa(wxFontEncoding encoding)
2325     {
2326         Init( wxCFStringEncFromFontEnc(encoding) );
2327     }
2328
2329     ~wxMBConv_cocoa()
2330     {
2331     }
2332
2333     void Init( CFStringEncoding encoding)
2334     {
2335         m_encoding = encoding ;
2336     }
2337
2338     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2339     {
2340         wxASSERT(szUnConv);
2341
2342         CFStringRef theString = CFStringCreateWithBytes (
2343                                                 NULL, //the allocator
2344                                                 (const UInt8*)szUnConv,
2345                                                 strlen(szUnConv),
2346                                                 m_encoding,
2347                                                 false //no BOM/external representation
2348                                                 );
2349
2350         wxASSERT(theString);
2351
2352         size_t nOutLength = CFStringGetLength(theString);
2353
2354         if (szOut == NULL)
2355         {
2356             CFRelease(theString);
2357             return nOutLength;
2358         }
2359
2360         CFRange theRange = { 0, nOutSize };
2361
2362 #if SIZEOF_WCHAR_T == 4
2363         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2364 #endif
2365
2366         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2367
2368         CFRelease(theString);
2369
2370         szUniCharBuffer[nOutLength] = '\0' ;
2371
2372 #if SIZEOF_WCHAR_T == 4
2373         wxMBConvUTF16 converter ;
2374         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2375         delete[] szUniCharBuffer;
2376 #endif
2377
2378         return nOutLength;
2379     }
2380
2381     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2382     {
2383         wxASSERT(szUnConv);
2384
2385         size_t nRealOutSize;
2386         size_t nBufSize = wxWcslen(szUnConv);
2387         UniChar* szUniBuffer = (UniChar*) szUnConv;
2388
2389 #if SIZEOF_WCHAR_T == 4
2390         wxMBConvUTF16 converter ;
2391         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2392         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2393         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2394         nBufSize /= sizeof(UniChar);
2395 #endif
2396
2397         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2398                                 NULL, //allocator
2399                                 szUniBuffer,
2400                                 nBufSize,
2401                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2402                             );
2403
2404         wxASSERT(theString);
2405
2406         //Note that CER puts a BOM when converting to unicode
2407         //so we  check and use getchars instead in that case
2408         if (m_encoding == kCFStringEncodingUnicode)
2409         {
2410             if (szOut != NULL)
2411                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2412
2413             nRealOutSize = CFStringGetLength(theString) + 1;
2414         }
2415         else
2416         {
2417             CFStringGetBytes(
2418                 theString,
2419                 CFRangeMake(0, CFStringGetLength(theString)),
2420                 m_encoding,
2421                 0, //what to put in characters that can't be converted -
2422                     //0 tells CFString to return NULL if it meets such a character
2423                 false, //not an external representation
2424                 (UInt8*) szOut,
2425                 nOutSize,
2426                 (CFIndex*) &nRealOutSize
2427                         );
2428         }
2429
2430         CFRelease(theString);
2431
2432 #if SIZEOF_WCHAR_T == 4
2433         delete[] szUniBuffer;
2434 #endif
2435
2436         return  nRealOutSize - 1;
2437     }
2438
2439     bool IsOk() const
2440     {
2441         return m_encoding != kCFStringEncodingInvalidId &&
2442               CFStringIsEncodingAvailable(m_encoding);
2443     }
2444
2445 private:
2446     CFStringEncoding m_encoding ;
2447 };
2448
2449 #endif // defined(__WXCOCOA__)
2450
2451 // ============================================================================
2452 // Mac conversion classes
2453 // ============================================================================
2454
2455 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2456
2457 class wxMBConv_mac : public wxMBConv
2458 {
2459 public:
2460     wxMBConv_mac()
2461     {
2462         Init(CFStringGetSystemEncoding()) ;
2463     }
2464
2465 #if wxUSE_FONTMAP
2466     wxMBConv_mac(const wxChar* name)
2467     {
2468         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2469     }
2470 #endif
2471
2472     wxMBConv_mac(wxFontEncoding encoding)
2473     {
2474         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2475     }
2476
2477     ~wxMBConv_mac()
2478     {
2479         OSStatus status = noErr ;
2480         status = TECDisposeConverter(m_MB2WC_converter);
2481         status = TECDisposeConverter(m_WC2MB_converter);
2482     }
2483
2484
2485     void Init( TextEncodingBase encoding)
2486     {
2487         OSStatus status = noErr ;
2488         m_char_encoding = encoding ;
2489         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2490
2491         status = TECCreateConverter(&m_MB2WC_converter,
2492                                     m_char_encoding,
2493                                     m_unicode_encoding);
2494         status = TECCreateConverter(&m_WC2MB_converter,
2495                                     m_unicode_encoding,
2496                                     m_char_encoding);
2497     }
2498
2499     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2500     {
2501         OSStatus status = noErr ;
2502         ByteCount byteOutLen ;
2503         ByteCount byteInLen = strlen(psz) ;
2504         wchar_t *tbuf = NULL ;
2505         UniChar* ubuf = NULL ;
2506         size_t res = 0 ;
2507
2508         if (buf == NULL)
2509         {
2510             //apple specs say at least 32
2511             n = wxMax( 32 , byteInLen ) ;
2512             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2513         }
2514         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2515 #if SIZEOF_WCHAR_T == 4
2516         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2517 #else
2518         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2519 #endif
2520         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2521           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2522 #if SIZEOF_WCHAR_T == 4
2523         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2524         // is not properly terminated we get random characters at the end
2525         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2526         wxMBConvUTF16 converter ;
2527         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2528         free( ubuf ) ;
2529 #else
2530         res = byteOutLen / sizeof( UniChar ) ;
2531 #endif
2532         if ( buf == NULL )
2533              free(tbuf) ;
2534
2535         if ( buf  && res < n)
2536             buf[res] = 0;
2537
2538         return res ;
2539     }
2540
2541     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2542     {
2543         OSStatus status = noErr ;
2544         ByteCount byteOutLen ;
2545         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2546
2547         char *tbuf = NULL ;
2548
2549         if (buf == NULL)
2550         {
2551             //apple specs say at least 32
2552             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2553             tbuf = (char*) malloc( n ) ;
2554         }
2555
2556         ByteCount byteBufferLen = n ;
2557         UniChar* ubuf = NULL ;
2558 #if SIZEOF_WCHAR_T == 4
2559         wxMBConvUTF16 converter ;
2560         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2561         byteInLen = unicharlen ;
2562         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2563         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2564 #else
2565         ubuf = (UniChar*) psz ;
2566 #endif
2567         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2568             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2569 #if SIZEOF_WCHAR_T == 4
2570         free( ubuf ) ;
2571 #endif
2572         if ( buf == NULL )
2573             free(tbuf) ;
2574
2575         size_t res = byteOutLen ;
2576         if ( buf  && res < n)
2577         {
2578             buf[res] = 0;
2579
2580             //we need to double-trip to verify it didn't insert any ? in place
2581             //of bogus characters
2582             wxWCharBuffer wcBuf(n);
2583             size_t pszlen = wxWcslen(psz);
2584             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2585                         wxWcslen(wcBuf) != pszlen ||
2586                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2587             {
2588                 // we didn't obtain the same thing we started from, hence
2589                 // the conversion was lossy and we consider that it failed
2590                 return (size_t)-1;
2591             }
2592         }
2593
2594         return res ;
2595     }
2596
2597     bool IsOk() const
2598         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2599
2600 private:
2601     TECObjectRef m_MB2WC_converter ;
2602     TECObjectRef m_WC2MB_converter ;
2603
2604     TextEncodingBase m_char_encoding ;
2605     TextEncodingBase m_unicode_encoding ;
2606 };
2607
2608 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2609
2610 // ============================================================================
2611 // wxEncodingConverter based conversion classes
2612 // ============================================================================
2613
2614 #if wxUSE_FONTMAP
2615
2616 class wxMBConv_wxwin : public wxMBConv
2617 {
2618 private:
2619     void Init()
2620     {
2621         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2622                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2623     }
2624
2625 public:
2626     // temporarily just use wxEncodingConverter stuff,
2627     // so that it works while a better implementation is built
2628     wxMBConv_wxwin(const wxChar* name)
2629     {
2630         if (name)
2631             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2632         else
2633             m_enc = wxFONTENCODING_SYSTEM;
2634
2635         Init();
2636     }
2637
2638     wxMBConv_wxwin(wxFontEncoding enc)
2639     {
2640         m_enc = enc;
2641
2642         Init();
2643     }
2644
2645     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2646     {
2647         size_t inbuf = strlen(psz);
2648         if (buf)
2649         {
2650             if (!m2w.Convert(psz,buf))
2651                 return (size_t)-1;
2652         }
2653         return inbuf;
2654     }
2655
2656     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2657     {
2658         const size_t inbuf = wxWcslen(psz);
2659         if (buf)
2660         {
2661             if (!w2m.Convert(psz,buf))
2662                 return (size_t)-1;
2663         }
2664
2665         return inbuf;
2666     }
2667
2668     bool IsOk() const { return m_ok; }
2669
2670 public:
2671     wxFontEncoding m_enc;
2672     wxEncodingConverter m2w, w2m;
2673
2674 private:
2675     virtual size_t GetMinMBCharWidth() const
2676     {
2677         switch ( m_enc )
2678         {
2679             case wxFONTENCODING_UTF16BE:
2680             case wxFONTENCODING_UTF16LE:
2681                 return 2;
2682
2683             case wxFONTENCODING_UTF32BE:
2684             case wxFONTENCODING_UTF32LE:
2685                 return 4;
2686
2687             default:
2688                 return 1;
2689         }
2690     }
2691
2692     // were we initialized successfully?
2693     bool m_ok;
2694
2695     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2696 };
2697
2698 // make the constructors available for unit testing
2699 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2700 {
2701     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2702     if ( !result->IsOk() )
2703     {
2704         delete result;
2705         return 0;
2706     }
2707     return result;
2708 }
2709
2710 #endif // wxUSE_FONTMAP
2711
2712 // ============================================================================
2713 // wxCSConv implementation
2714 // ============================================================================
2715
2716 void wxCSConv::Init()
2717 {
2718     m_name = NULL;
2719     m_convReal =  NULL;
2720     m_deferred = true;
2721 }
2722
2723 wxCSConv::wxCSConv(const wxChar *charset)
2724 {
2725     Init();
2726
2727     if ( charset )
2728     {
2729         SetName(charset);
2730     }
2731
2732 #if wxUSE_FONTMAP
2733     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2734 #else
2735     m_encoding = wxFONTENCODING_SYSTEM;
2736 #endif
2737 }
2738
2739 wxCSConv::wxCSConv(wxFontEncoding encoding)
2740 {
2741     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2742     {
2743         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2744
2745         encoding = wxFONTENCODING_SYSTEM;
2746     }
2747
2748     Init();
2749
2750     m_encoding = encoding;
2751 }
2752
2753 wxCSConv::~wxCSConv()
2754 {
2755     Clear();
2756 }
2757
2758 wxCSConv::wxCSConv(const wxCSConv& conv)
2759         : wxMBConv()
2760 {
2761     Init();
2762
2763     SetName(conv.m_name);
2764     m_encoding = conv.m_encoding;
2765 }
2766
2767 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2768 {
2769     Clear();
2770
2771     SetName(conv.m_name);
2772     m_encoding = conv.m_encoding;
2773
2774     return *this;
2775 }
2776
2777 void wxCSConv::Clear()
2778 {
2779     free(m_name);
2780     delete m_convReal;
2781
2782     m_name = NULL;
2783     m_convReal = NULL;
2784 }
2785
2786 void wxCSConv::SetName(const wxChar *charset)
2787 {
2788     if (charset)
2789     {
2790         m_name = wxStrdup(charset);
2791         m_deferred = true;
2792     }
2793 }
2794
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// hash map from a font encoding to a charset name
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate() when looking for a charset name
// accepted by iconv: it maps an encoding to the name for which a working
// wxMBConv_iconv could be created, or to an empty string if none of the
// names of this encoding worked
static wxEncodingNameCache gs_nameCache;
#endif
2803
2804 wxMBConv *wxCSConv::DoCreate() const
2805 {
2806 #if wxUSE_FONTMAP
2807     wxLogTrace(TRACE_STRCONV,
2808                wxT("creating conversion for %s"),
2809                (m_name ? m_name
2810                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2811 #endif // wxUSE_FONTMAP
2812
2813     // check for the special case of ASCII or ISO8859-1 charset: as we have
2814     // special knowledge of it anyhow, we don't need to create a special
2815     // conversion object
2816     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2817             m_encoding == wxFONTENCODING_DEFAULT )
2818     {
2819         // don't convert at all
2820         return NULL;
2821     }
2822
2823     // we trust OS to do conversion better than we can so try external
2824     // conversion methods first
2825     //
2826     // the full order is:
2827     //      1. OS conversion (iconv() under Unix or Win32 API)
2828     //      2. hard coded conversions for UTF
2829     //      3. wxEncodingConverter as fall back
2830
2831     // step (1)
2832 #ifdef HAVE_ICONV
2833 #if !wxUSE_FONTMAP
2834     if ( m_name )
2835 #endif // !wxUSE_FONTMAP
2836     {
2837         wxString name(m_name);
2838         wxFontEncoding encoding(m_encoding);
2839
2840         if ( !name.empty() )
2841         {
2842             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2843             if ( conv->IsOk() )
2844                 return conv;
2845
2846             delete conv;
2847
2848 #if wxUSE_FONTMAP
2849             encoding =
2850                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2851 #endif // wxUSE_FONTMAP
2852         }
2853 #if wxUSE_FONTMAP
2854         {
2855             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2856             if ( it != gs_nameCache.end() )
2857             {
2858                 if ( it->second.empty() )
2859                     return NULL;
2860
2861                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2862                 if ( conv->IsOk() )
2863                     return conv;
2864
2865                 delete conv;
2866             }
2867
2868             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2869
2870             for ( ; *names; ++names )
2871             {
2872                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2873                 if ( conv->IsOk() )
2874                 {
2875                     gs_nameCache[encoding] = *names;
2876                     return conv;
2877                 }
2878
2879                 delete conv;
2880             }
2881
2882             gs_nameCache[encoding] = _T(""); // cache the failure
2883         }
2884 #endif // wxUSE_FONTMAP
2885     }
2886 #endif // HAVE_ICONV
2887
2888 #ifdef wxHAVE_WIN32_MB2WC
2889     {
2890 #if wxUSE_FONTMAP
2891         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2892                                       : new wxMBConv_win32(m_encoding);
2893         if ( conv->IsOk() )
2894             return conv;
2895
2896         delete conv;
2897 #else
2898         return NULL;
2899 #endif
2900     }
2901 #endif // wxHAVE_WIN32_MB2WC
2902 #if defined(__WXMAC__)
2903     {
2904         // leave UTF16 and UTF32 to the built-ins of wx
2905         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2906             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2907         {
2908
2909 #if wxUSE_FONTMAP
2910             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2911                                         : new wxMBConv_mac(m_encoding);
2912 #else
2913             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2914 #endif
2915             if ( conv->IsOk() )
2916                  return conv;
2917
2918             delete conv;
2919         }
2920     }
2921 #endif
2922 #if defined(__WXCOCOA__)
2923     {
2924         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2925         {
2926
2927 #if wxUSE_FONTMAP
2928             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2929                                           : new wxMBConv_cocoa(m_encoding);
2930 #else
2931             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2932 #endif
2933             if ( conv->IsOk() )
2934                  return conv;
2935
2936             delete conv;
2937         }
2938     }
2939 #endif
2940     // step (2)
2941     wxFontEncoding enc = m_encoding;
2942 #if wxUSE_FONTMAP
2943     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2944     {
2945         // use "false" to suppress interactive dialogs -- we can be called from
2946         // anywhere and popping up a dialog from here is the last thing we want to
2947         // do
2948         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2949     }
2950 #endif // wxUSE_FONTMAP
2951
2952     switch ( enc )
2953     {
2954         case wxFONTENCODING_UTF7:
2955              return new wxMBConvUTF7;
2956
2957         case wxFONTENCODING_UTF8:
2958              return new wxMBConvUTF8;
2959
2960         case wxFONTENCODING_UTF16BE:
2961              return new wxMBConvUTF16BE;
2962
2963         case wxFONTENCODING_UTF16LE:
2964              return new wxMBConvUTF16LE;
2965
2966         case wxFONTENCODING_UTF32BE:
2967              return new wxMBConvUTF32BE;
2968
2969         case wxFONTENCODING_UTF32LE:
2970              return new wxMBConvUTF32LE;
2971
2972         default:
2973              // nothing to do but put here to suppress gcc warnings
2974              ;
2975     }
2976
2977     // step (3)
2978 #if wxUSE_FONTMAP
2979     {
2980         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2981                                       : new wxMBConv_wxwin(m_encoding);
2982         if ( conv->IsOk() )
2983             return conv;
2984
2985         delete conv;
2986     }
2987 #endif // wxUSE_FONTMAP
2988
2989     // NB: This is a hack to prevent deadlock. What could otherwise happen
2990     //     in Unicode build: wxConvLocal creation ends up being here
2991     //     because of some failure and logs the error. But wxLog will try to
2992     //     attach timestamp, for which it will need wxConvLocal (to convert
2993     //     time to char* and then wchar_t*), but that fails, tries to log
2994     //     error, but wxLog has a (already locked) critical section that
2995     //     guards static buffer.
2996     static bool alreadyLoggingError = false;
2997     if (!alreadyLoggingError)
2998     {
2999         alreadyLoggingError = true;
3000         wxLogError(_("Cannot convert from the charset '%s'!"),
3001                    m_name ? m_name
3002                       :
3003 #if wxUSE_FONTMAP
3004                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3005 #else // !wxUSE_FONTMAP
3006                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3007 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3008               );
3009         alreadyLoggingError = false;
3010     }
3011
3012     return NULL;
3013 }
3014
3015 void wxCSConv::CreateConvIfNeeded() const
3016 {
3017     if ( m_deferred )
3018     {
3019         wxCSConv *self = (wxCSConv *)this; // const_cast
3020
3021 #if wxUSE_INTL
3022         // if we don't have neither the name nor the encoding, use the default
3023         // encoding for this system
3024         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3025         {
3026             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3027         }
3028 #endif // wxUSE_INTL
3029
3030         self->m_convReal = DoCreate();
3031         self->m_deferred = false;
3032     }
3033 }
3034
3035 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3036 {
3037     CreateConvIfNeeded();
3038
3039     if (m_convReal)
3040         return m_convReal->MB2WC(buf, psz, n);
3041
3042     // latin-1 (direct)
3043     size_t len = strlen(psz);
3044
3045     if (buf)
3046     {
3047         for (size_t c = 0; c <= len; c++)
3048             buf[c] = (unsigned char)(psz[c]);
3049     }
3050
3051     return len;
3052 }
3053
3054 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3055 {
3056     CreateConvIfNeeded();
3057
3058     if (m_convReal)
3059         return m_convReal->WC2MB(buf, psz, n);
3060
3061     // latin-1 (direct)
3062     const size_t len = wxWcslen(psz);
3063     if (buf)
3064     {
3065         for (size_t c = 0; c <= len; c++)
3066         {
3067             if (psz[c] > 0xFF)
3068                 return (size_t)-1;
3069             buf[c] = (char)psz[c];
3070         }
3071     }
3072     else
3073     {
3074         for (size_t c = 0; c <= len; c++)
3075         {
3076             if (psz[c] > 0xFF)
3077                 return (size_t)-1;
3078         }
3079     }
3080
3081     return len;
3082 }
3083
3084 size_t wxCSConv::GetMinMBCharWidth() const
3085 {
3086     CreateConvIfNeeded();
3087
3088     if ( m_convReal )
3089     {
3090         // cast needed just to call private function of m_convReal
3091         return ((wxCSConv *)m_convReal)->GetMinMBCharWidth();
3092     }
3093
3094     return 1;
3095 }
3096
3097 // ----------------------------------------------------------------------------
3098 // globals
3099 // ----------------------------------------------------------------------------
3100
// the object behind wxConvLibc: its class is chosen at compile time,
// wxMBConv_win32 under __WINDOWS__, wxMBConv_mac under __WXMAC__ without
// __MACH__ and wxMBConvLibc everywhere else
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the other file-local converter objects which the exported names below
// refer to
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// the exported converters are references bound to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;

// wxConvCurrent is a pointer and initially points to the libc converter
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;

// wxConvFileName initially points to the UTF-8 converter under __WXOSX__ and
// to the libc converter on all the other platforms
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3126
3127
3128 #else // !wxUSE_WCHAR_T
3129
// stand-ins in absence of wchar_t: in the !wxUSE_WCHAR_T branch the converter
// globals are defined as plain wxMBConv objects; only these four names are
// defined here
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3135
3136 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T