src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // ----------------------------------------------------------------------------
  90 // UTF-16 en/decoding to/from UCS-4
  91 // ----------------------------------------------------------------------------
  92
  93
  94 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  95 {
  96     if (input<=0xffff)
  97     {
  98         if (output)
  99             *output = (wxUint16) input;
 100         return 1;
 101     }
 102     else if (input>=0x110000)
 103     {
 104         return (size_t)-1;
 105     }
 106     else
 107     {
 108         if (output)
 109         {
 110             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 111             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 112         }
 113         return 2;
 114     }
 115 }
 116
 117 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 118 {
 119     if ((*input<0xd800) || (*input>0xdfff))
 120     {
 121         output = *input;
 122         return 1;
 123     }
 124     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 125     {
 126         output = *input;
 127         return (size_t)-1;
 128     }
 129     else
 130     {
 131         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 132         return 2;
 133     }
 134 }
 135
 136
 137 // ----------------------------------------------------------------------------
 138 // wxMBConv
 139 // ----------------------------------------------------------------------------
 140
 141 wxMBConv::~wxMBConv()
 142 {
 143     // nothing to do here (necessary for Darwin linking probably)
 144 }
 145
 146 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 147 {
 148     if ( psz )
 149     {
 150         // calculate the length of the buffer needed first
 151         size_t nLen = MB2WC(NULL, psz, 0);
 152         if ( nLen != (size_t)-1 )
 153         {
 154             // now do the actual conversion
 155             wxWCharBuffer buf(nLen);
 156             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 157             if ( nLen != (size_t)-1 )
 158             {
 159                 return buf;
 160             }
 161         }
 162     }
 163
 164     wxWCharBuffer buf((wchar_t *)NULL);
 165
 166     return buf;
 167 }
 168
 169 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 170 {
 171     if ( pwz )
 172     {
 173         size_t nLen = WC2MB(NULL, pwz, 0);
 174         if ( nLen != (size_t)-1 )
 175         {
 176             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 177             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 178             if ( nLen != (size_t)-1 )
 179             {
 180                 return buf;
 181             }
 182         }
 183     }
 184
 185     wxCharBuffer buf((char *)NULL);
 186
 187     return buf;
 188 }
 189
 190 // helper of cMB2WC(): check if n bytes at this location are all NUL
 191 static bool NotAllNULs(const char *p, size_t n)
 192 {
 193     while ( n && *p++ == '\0' )
 194         n--;
 195
 196     return n != 0;
 197 }
 198
 199 const wxWCharBuffer
 200 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 201 {
 202     // the currently accumulated wide characters
 203     wxWCharBuffer wbuf;
 204
 205     // the current length of wbuf
 206     size_t lenBuf = 0;
 207
 208     // the number of NULs terminating this string
 209     size_t nulLen   wxDUMMY_INITIALIZE(0);
 210
 211     // make a copy of the input string unless it is already properly
 212     // NUL-terminated
 213     wxCharBuffer bufTmp;
 214
 215     // if we were not given the input size we just have to assume that the
 216     // string is properly terminated as we have no way of knowing how long it
 217     // is anyhow, but if we do have the size check whether there are enough
 218     // NULs at the end
 219     if ( inLen != (size_t)-1 )
 220     {
 221         // we need to know how to find the end of this string
 222         nulLen = GetMinMBCharWidth();
 223         if ( nulLen == (size_t)-1 )
 224             return wbuf;
 225
 226         // if there are enough NULs we can avoid the copy
 227         if ( inLen < nulLen || NotAllNULs(in + inLen - nulLen, nulLen) )
 228         {
 229             // make a copy in order to properly NUL-terminate the string
 230             bufTmp = wxCharBuffer(inLen + nulLen - 1 /* 1 will be added */);
 231             char * const p = bufTmp.data();
 232             memcpy(p, in, inLen);
 233             for ( char *s = p + inLen; s < p + inLen + nulLen; s++ )
 234                 *s = '\0';
 235         }
 236     }
 237
 238     if ( bufTmp )
 239         in = bufTmp;
 240
 241     size_t lenChunk;
 242     for ( const char * const inEnd = in + inLen;; )
 243     {
 244         // try to convert the current chunk
 245         lenChunk = MB2WC(NULL, in, 0);
 246         if ( lenChunk == 0 )
 247         {
 248             // nothing left in the input string, conversion succeeded
 249             break;
 250         }
 251
 252         if ( lenChunk == (size_t)-1 )
 253             break;
 254
 255         // if we already have a previous chunk, leave the NUL separating it
 256         // from this one
 257         if ( lenBuf )
 258             lenBuf++;
 259
 260         const size_t lenBufNew = lenBuf + lenChunk;
 261         if ( !wbuf.extend(lenBufNew) )
 262         {
 263             lenChunk = (size_t)-1;
 264             break;
 265         }
 266
 267         lenChunk = MB2WC(wbuf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
 268         if ( lenChunk == (size_t)-1 )
 269             break;
 270
 271         lenBuf = lenBufNew;
 272
 273         if ( inLen == (size_t)-1 )
 274         {
 275             // convert only one chunk in this case, as we suppose that the
 276             // string is NUL-terminated and so inEnd is not used at all
 277             break;
 278         }
 279
 280         // advance the input pointer past the end of this chunk
 281         while ( NotAllNULs(in, nulLen) )
 282         {
 283             // notice that we must skip over multiple bytes here as we suppose
 284             // that if NUL takes 2 or 4 bytes, then all the other characters do
 285             // too and so if advanced by a single byte we might erroneously
 286             // detect sequences of NUL bytes in the middle of the input
 287             in += nulLen;
 288         }
 289
 290         in += nulLen; // skipping over its terminator as well
 291
 292         // note that ">=" (and not just "==") is needed here as the terminator
 293         // we skipped just above could be inside or just after the buffer
 294         // delimited by inEnd
 295         if ( in >= inEnd )
 296             break;
 297     }
 298
 299     if ( lenChunk == (size_t)-1 )
 300     {
 301         // conversion failed
 302         lenBuf = 0;
 303         wbuf.reset();
 304     }
 305
 306     if ( outLen )
 307         *outLen = lenBuf;
 308
 309     return wbuf;
 310 }
 311
// Convert a wide character buffer which may contain embedded NULs to
// multibyte representation, one NUL-terminated chunk at a time.
//
// in     - the input data
// inLen  - its length in wide characters or (size_t)-1 if the input is
//          NUL-terminated
// outLen - if not NULL, receives the length of the result without the NUL
//          written after the last chunk or 0 if the conversion failed
//
// Returns the converted data or an empty buffer on failure.
//
// NOTE(review): a chunk for which WC2MB() returns 0 (e.g. an empty one
// between two consecutive NULs) is indistinguishable from the end of input
// here and stops the conversion successfully -- confirm this is intended.
const wxCharBuffer
wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
{
    // the currently accumulated multibyte characters
    wxCharBuffer buf;

    // the current length of buf
    size_t lenBuf = 0;

    // make a copy of the input string unless it is already properly
    // NUL-terminated
    //
    // if we don't know its length we have no choice but to assume that it is,
    // indeed, properly terminated
    wxWCharBuffer bufTmp;
    if ( inLen == (size_t)-1 )
    {
        // include the terminating NUL in the length so that inEnd below
        // points just after it
        inLen = wxWcslen(in) + 1;
    }
    else if ( inLen != 0 && in[inLen - 1] != L'\0' )
    {
        // make a copy in order to properly NUL-terminate the string
        // (presumably wxWCharBuffer(len) allocates and zeroes the extra
        // terminating element itself -- verify against its declaration)
        bufTmp = wxWCharBuffer(inLen);
        memcpy(bufTmp.data(), in, inLen*sizeof(wchar_t));
    }

    if ( bufTmp )
        in = bufTmp;

    for ( const wchar_t * const inEnd = in + inLen;; )
    {
        // try to convert the current chunk, if anything left
        size_t lenChunk = in < inEnd ? WC2MB(NULL, in, 0) : 0;
        if ( lenChunk == 0 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // lenBuf counts a NUL after each converted chunk, don't include
            // the last one in the returned length
            if ( outLen )
                *outLen = lenBuf ? lenBuf - 1 : lenBuf;

            return buf;
        }

        if ( lenChunk == (size_t)-1 )
            break;

        const size_t lenBufNew = lenBuf + lenChunk;
        if ( !buf.extend(lenBufNew) )
            break;

        lenChunk = WC2MB(buf.data() + lenBuf, in, lenChunk + 1 /* for NUL */);
        if ( lenChunk == (size_t)-1 )
            break;

        // chunk successfully converted, go to the next one
        in += wxWcslen(in) + 1 /* skip NUL too */;
        lenBuf = lenBufNew + 1;
    }

    // conversion failed
    if ( outLen )
        *outLen = 0;

    return wxCharBuffer();
}
 376
 377 // ----------------------------------------------------------------------------
 378 // wxMBConvLibc
 379 // ----------------------------------------------------------------------------
 380
 381 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 382 {
 383     return wxMB2WC(buf, psz, n);
 384 }
 385
 386 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 387 {
 388     return wxWC2MB(buf, psz, n);
 389 }
 390
 391 // ----------------------------------------------------------------------------
 392 // wxConvBrokenFileNames
 393 // ----------------------------------------------------------------------------
 394
 395 #ifdef __UNIX__
 396
 397 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 398 {
 399     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 400                   || wxStricmp(charset, _T("UTF8")) == 0  )
 401         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 402     else
 403         m_conv = new wxCSConv(charset);
 404 }
 405
 406 #endif // __UNIX__
 407
 408 // ----------------------------------------------------------------------------
 409 // UTF-7
 410 // ----------------------------------------------------------------------------
 411
 412 // Implementation (C) 2004 Fredrik Roubert
 413
 414 //
 415 // BASE64 decoding table
 416 //
 417 static const unsigned char utf7unb64[] =
 418 {
 419     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 420     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 421     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 422     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 423     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 424     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 425     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 426     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 427     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 428     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 429     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 430     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 431     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 432     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 433     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 434     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 435     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 436     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 437     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 438     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 439     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 440     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 441     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 442     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 443     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 444     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 445     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 446     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 447     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 448     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 449     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 450     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 451 };
 452
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, up to n wide characters are stored in it and the result
// is NUL-terminated if there is still room left for this; if it is NULL,
// only the length is computed.
//
// Returns the number of wide characters decoded, not counting the
// terminating NUL, or (size_t)-1 if a '+' is followed neither by '-' nor by
// enough BASE64 characters to produce at least one byte.
//
// NOTE(review): the n limit is only checked between the top level elements,
// not while decoding a BASE64 sequence, so buf presumably must be big enough
// for the entire sequence -- confirm that all callers size it accordingly.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits, l is the number of bits in it
            // not extracted yet, lsb tells whether the next byte extracted
            // is the low (true) or the high (false) half of a 16 bit unit and
            // ok becomes true once at least one byte was extracted
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // each BASE64 character contributes 6 more bits
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // take the topmost 8 of the pending bits
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // low byte completes the character started below
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // high byte starts a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return (size_t)-1;
            }

            // the '-' explicitly terminating the sequence is not a part of
            // the output
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 520
 521 //
 522 // BASE64 encoding table
 523 //
 524 static const unsigned char utf7enb64[] =
 525 {
 526     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 527     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 528     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 529     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 530     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 531     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 532     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 533     '4', '5', '6', '7', '8', '9', '+', '/'
 534 };
 535
 536 //
 537 // UTF-7 encoding table
 538 //
 539 // 0 - Set D (directly encoded characters)
 540 // 1 - Set O (optional direct characters)
 541 // 2 - whitespace characters (optional)
 542 // 3 - special characters
 543 //
 544 static const unsigned char utf7encode[128] =
 545 {
 546     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 547     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 548     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 549     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 550     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 551     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 552     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 553     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 554 };
 555
 556 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 557 {
 558     size_t len = 0;
 559
 560     while (*psz && ((!buf) || (len < n)))
 561     {
 562         wchar_t cc = *psz++;
 563         if (cc < 0x80 && utf7encode[cc] < 1)
 564         {
 565             // plain ASCII char
 566             if (buf)
 567                 *buf++ = (char)cc;
 568             len++;
 569         }
 570 #ifndef WC_UTF16
 571         else if (((wxUint32)cc) > 0xffff)
 572         {
 573             // no surrogate pair generation (yet?)
 574             return (size_t)-1;
 575         }
 576 #endif
 577         else
 578         {
 579             if (buf)
 580                 *buf++ = '+';
 581             len++;
 582             if (cc != '+')
 583             {
 584                 // BASE64 encode string
 585                 unsigned int lsb, d, l;
 586                 for (d = 0, l = 0; /*nothing*/; psz++)
 587                 {
 588                     for (lsb = 0; lsb < 2; lsb ++)
 589                     {
 590                         d <<= 8;
 591                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 592
 593                         for (l += 8; l >= 6; )
 594                         {
 595                             l -= 6;
 596                             if (buf)
 597                                 *buf++ = utf7enb64[(d >> l) % 64];
 598                             len++;
 599                         }
 600                     }
 601                     cc = *psz;
 602                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 603                         break;
 604                 }
 605                 if (l != 0)
 606                 {
 607                     if (buf)
 608                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 609                     len++;
 610                 }
 611             }
 612             if (buf)
 613                 *buf++ = '-';
 614             len++;
 615         }
 616     }
 617     if (buf && (len < n))
 618         *buf = 0;
 619     return len;
 620 }
 621
 622 // ----------------------------------------------------------------------------
 623 // UTF-8
 624 // ----------------------------------------------------------------------------
 625
 626 static wxUint32 utf8_max[]=
 627     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 628
 629 // boundaries of the private use area we use to (temporarily) remap invalid
 630 // characters invalid in a UTF-8 encoded string
 631 const wxUint32 wxUnicodePUA = 0x100000;
 632 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 633
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL the result is stored in it and NUL-terminated if, in the
// end, fewer than n wide characters were written; if it is NULL only the
// length is computed.
//
// Invalid input (unexpected or missing continuation bytes, overlong
// encodings, values not representable in UTF-16 when wchar_t is 16 bits and,
// with MAP_INVALID_UTF8_TO_PUA, the PUA characters used for the mapping
// themselves) is handled according to m_options: the offending bytes are
// either mapped to wxUnicodePUA + byte value, or to 4 character long octal
// escapes ("\ooo", with the backslashes in the valid input being doubled), or
// make the function fail.
//
// Returns the number of wide characters in the result, not counting the
// terminating NUL, or (size_t)-1 on failure.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to go back to it if
        // it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of the continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte can't start a sequence)
                invalid = true;
            }
            else
            {
                // ocnt indexes the greatest value encodable with one byte
                // less than this sequence uses in utf8_max[]
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    // (the value would have fit in a shorter sequence)
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                // deal with all the bytes of the sequence consumed so far,
                // i.e. those between opsz and psz, one by one
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // NOTE(review): if the escape doesn't fit, nothing is
                        // written but len still grows by 4 -- confirm that
                        // the callers never pass a buffer that small
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
 770
 771 static inline bool isoctal(wchar_t wch)
 772 {
 773     return L'0' <= wch && wch <= L'7';
 774 }
 775
// Encode the NUL-terminated wide string psz in UTF-8.
//
// This also undoes the mappings applied by MB2WC() according to m_options:
// with MAP_INVALID_UTF8_TO_PUA, the characters in the range from wxUnicodePUA
// to wxUnicodePUAEnd (exclusive) are output as single raw bytes and, with
// MAP_INVALID_UTF8_TO_OCTAL, a doubled backslash yields a single one and a
// backslash followed by 3 octal digits yields the byte with this value.
//
// If buf is not NULL the result is stored in it and NUL-terminated if, in the
// end, fewer than n bytes were written; if it is NULL only the length is
// computed.
//
// Returns the number of bytes in the result, not counting the terminating
// NUL.
//
// NOTE(review): n is only checked between the characters, so a multibyte
// sequence started when len < n is always written entirely -- confirm that
// the callers account for this.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an unpaired surrogate is not an error here: skip just this unit and
        // encode its value, which decode_utf16() stored in cc, as is
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original invalid byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output one and skip the other
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it stands for
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0')*0100 +
                                 (psz[1] - L'0')*010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of the continuation bytes needed for cc
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has cnt + 1 most significant bits set
                    // followed by the most significant bits of cc (this
                    // relies on >> of a negative value being an arithmetic
                    // shift), the other bytes contain 6 bits of cc each
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}
 850
 851 // ----------------------------------------------------------------------------
 852 // UTF-16
 853 // ----------------------------------------------------------------------------
 854
// The UTF-16 conversions are implemented below by two helper "classes":
// wxMBConvUTF16straight, which copies the 16 bit units unchanged, and
// wxMBConvUTF16swap, which swaps the bytes of each unit. These macros map
// them to the real big and little endian converters depending on the byte
// order indicated by WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 862
 863
 864 #ifdef WC_UTF16
 865
 866 // copy 16bit MB to 16bit String
 867 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 868 {
 869     size_t len=0;
 870
 871     while (*(wxUint16*)psz && (!buf || len < n))
 872     {
 873         if (buf)
 874             *buf++ = *(wxUint16*)psz;
 875         len++;
 876
 877         psz += sizeof(wxUint16);
 878     }
 879     if (buf && len<n)   *buf=0;
 880
 881     return len;
 882 }
 883
 884
 885 // copy 16bit String to 16bit MB
 886 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 887 {
 888     size_t len=0;
 889
 890     while (*psz && (!buf || len < n))
 891     {
 892         if (buf)
 893         {
 894             *(wxUint16*)buf = *psz;
 895             buf += sizeof(wxUint16);
 896         }
 897         len += sizeof(wxUint16);
 898         psz++;
 899     }
 900     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 901
 902     return len;
 903 }
 904
 905
 906 // swap 16bit MB to 16bit String
 907 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 908 {
 909     size_t len = 0;
 910
 911     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 912     // inside the string
 913     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 914     {
 915         if ( buf )
 916         {
 917             ((char *)buf)[0] = psz[1];
 918             ((char *)buf)[1] = psz[0];
 919             buf++;
 920         }
 921         len++;
 922         psz += 2;
 923     }
 924
 925     if ( buf && len < n )
 926         *buf = L'\0';
 927
 928     return len;
 929 }
 930
 931
 932 // swap 16bit MB to 16bit String
 933 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 934 {
 935     size_t len = 0;
 936
 937     while ( *psz && (!buf || len < n) )
 938     {
 939         if ( buf )
 940         {
 941             *buf++ = ((char*)psz)[1];
 942             *buf++ = ((char*)psz)[0];
 943         }
 944         len += 2;
 945         psz++;
 946     }
 947
 948     if ( buf && len < n )
 949         *buf = '\0';
 950
 951     return len;
 952 }
 953
 954
 955 #else // WC_UTF16
 956
 957
 958 // copy 16bit MB to 32bit String
 959 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 960 {
 961     size_t len=0;
 962
 963     while (*(wxUint16*)psz && (!buf || len < n))
 964     {
 965         wxUint32 cc;
 966         size_t pa=decode_utf16((wxUint16*)psz, cc);
 967         if (pa == (size_t)-1)
 968             return pa;
 969
 970         if (buf)
 971             *buf++ = (wchar_t)cc;
 972         len++;
 973         psz += pa * sizeof(wxUint16);
 974     }
 975     if (buf && len<n)   *buf=0;
 976
 977     return len;
 978 }
 979
 980
 981 // copy 32bit String to 16bit MB
 982 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 983 {
 984     size_t len=0;
 985
 986     while (*psz && (!buf || len < n))
 987     {
 988         wxUint16 cc[2];
 989         size_t pa=encode_utf16(*psz, cc);
 990
 991         if (pa == (size_t)-1)
 992             return pa;
 993
 994         if (buf)
 995         {
 996             *(wxUint16*)buf = cc[0];
 997             buf += sizeof(wxUint16);
 998             if (pa > 1)
 999             {
1000                 *(wxUint16*)buf = cc[1];
1001                 buf += sizeof(wxUint16);
1002             }
1003         }
1004
1005         len += pa*sizeof(wxUint16);
1006         psz++;
1007     }
1008     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1009
1010     return len;
1011 }
1012
1013
1014 // swap 16bit MB to 32bit String
1015 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1016 {
1017     size_t len=0;
1018
1019     while (*(wxUint16*)psz && (!buf || len < n))
1020     {
1021         wxUint32 cc;
1022         char tmp[4];
1023         tmp[0]=psz[1];  tmp[1]=psz[0];
1024         tmp[2]=psz[3];  tmp[3]=psz[2];
1025
1026         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1027         if (pa == (size_t)-1)
1028             return pa;
1029
1030         if (buf)
1031             *buf++ = (wchar_t)cc;
1032
1033         len++;
1034         psz += pa * sizeof(wxUint16);
1035     }
1036     if (buf && len<n)   *buf=0;
1037
1038     return len;
1039 }
1040
1041
1042 // swap 32bit String to 16bit MB
1043 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1044 {
1045     size_t len=0;
1046
1047     while (*psz && (!buf || len < n))
1048     {
1049         wxUint16 cc[2];
1050         size_t pa=encode_utf16(*psz, cc);
1051
1052         if (pa == (size_t)-1)
1053             return pa;
1054
1055         if (buf)
1056         {
1057             *buf++ = ((char*)cc)[1];
1058             *buf++ = ((char*)cc)[0];
1059             if (pa > 1)
1060             {
1061                 *buf++ = ((char*)cc)[3];
1062                 *buf++ = ((char*)cc)[2];
1063             }
1064         }
1065
1066         len += pa*sizeof(wxUint16);
1067         psz++;
1068     }
1069     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1070
1071     return len;
1072 }
1073
1074 #endif // WC_UTF16
1075
1076
1077 // ----------------------------------------------------------------------------
1078 // UTF-32
1079 // ----------------------------------------------------------------------------
1080
1081 #ifdef WORDS_BIGENDIAN
1082 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1083 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1084 #else
1085 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1086 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1087 #endif
1088
1089
1090 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1091 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1092
1093
1094 #ifdef WC_UTF16
1095
1096 // copy 32bit MB to 16bit String
1097 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1098 {
1099     size_t len=0;
1100
1101     while (*(wxUint32*)psz && (!buf || len < n))
1102     {
1103         wxUint16 cc[2];
1104
1105         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1106         if (pa == (size_t)-1)
1107             return pa;
1108
1109         if (buf)
1110         {
1111             *buf++ = cc[0];
1112             if (pa > 1)
1113                 *buf++ = cc[1];
1114         }
1115         len += pa;
1116         psz += sizeof(wxUint32);
1117     }
1118     if (buf && len<n)   *buf=0;
1119
1120     return len;
1121 }
1122
1123
1124 // copy 16bit String to 32bit MB
1125 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1126 {
1127     size_t len=0;
1128
1129     while (*psz && (!buf || len < n))
1130     {
1131         wxUint32 cc;
1132
1133         // cast is ok for WC_UTF16
1134         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1135         if (pa == (size_t)-1)
1136             return pa;
1137
1138         if (buf)
1139         {
1140             *(wxUint32*)buf = cc;
1141             buf += sizeof(wxUint32);
1142         }
1143         len += sizeof(wxUint32);
1144         psz += pa;
1145     }
1146
1147     if (buf && len<=n-sizeof(wxUint32))
1148         *(wxUint32*)buf=0;
1149
1150     return len;
1151 }
1152
1153
1154
1155 // swap 32bit MB to 16bit String
1156 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1157 {
1158     size_t len=0;
1159
1160     while (*(wxUint32*)psz && (!buf || len < n))
1161     {
1162         char tmp[4];
1163         tmp[0] = psz[3];   tmp[1] = psz[2];
1164         tmp[2] = psz[1];   tmp[3] = psz[0];
1165
1166
1167         wxUint16 cc[2];
1168
1169         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1170         if (pa == (size_t)-1)
1171             return pa;
1172
1173         if (buf)
1174         {
1175             *buf++ = cc[0];
1176             if (pa > 1)
1177                 *buf++ = cc[1];
1178         }
1179         len += pa;
1180         psz += sizeof(wxUint32);
1181     }
1182
1183     if (buf && len<n)
1184         *buf=0;
1185
1186     return len;
1187 }
1188
1189
1190 // swap 16bit String to 32bit MB
1191 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1192 {
1193     size_t len=0;
1194
1195     while (*psz && (!buf || len < n))
1196     {
1197         char cc[4];
1198
1199         // cast is ok for WC_UTF16
1200         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1201         if (pa == (size_t)-1)
1202             return pa;
1203
1204         if (buf)
1205         {
1206             *buf++ = cc[3];
1207             *buf++ = cc[2];
1208             *buf++ = cc[1];
1209             *buf++ = cc[0];
1210         }
1211         len += sizeof(wxUint32);
1212         psz += pa;
1213     }
1214
1215     if (buf && len<=n-sizeof(wxUint32))
1216         *(wxUint32*)buf=0;
1217
1218     return len;
1219 }
1220
1221 #else // WC_UTF16
1222
1223
1224 // copy 32bit MB to 32bit String
1225 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1226 {
1227     size_t len=0;
1228
1229     while (*(wxUint32*)psz && (!buf || len < n))
1230     {
1231         if (buf)
1232             *buf++ = (wchar_t)(*(wxUint32*)psz);
1233         len++;
1234         psz += sizeof(wxUint32);
1235     }
1236
1237     if (buf && len<n)
1238         *buf=0;
1239
1240     return len;
1241 }
1242
1243
1244 // copy 32bit String to 32bit MB
1245 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1246 {
1247     size_t len=0;
1248
1249     while (*psz && (!buf || len < n))
1250     {
1251         if (buf)
1252         {
1253             *(wxUint32*)buf = *psz;
1254             buf += sizeof(wxUint32);
1255         }
1256
1257         len += sizeof(wxUint32);
1258         psz++;
1259     }
1260
1261     if (buf && len<=n-sizeof(wxUint32))
1262         *(wxUint32*)buf=0;
1263
1264     return len;
1265 }
1266
1267
1268 // swap 32bit MB to 32bit String
1269 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1270 {
1271     size_t len=0;
1272
1273     while (*(wxUint32*)psz && (!buf || len < n))
1274     {
1275         if (buf)
1276         {
1277             ((char *)buf)[0] = psz[3];
1278             ((char *)buf)[1] = psz[2];
1279             ((char *)buf)[2] = psz[1];
1280             ((char *)buf)[3] = psz[0];
1281             buf++;
1282         }
1283         len++;
1284         psz += sizeof(wxUint32);
1285     }
1286
1287     if (buf && len<n)
1288         *buf=0;
1289
1290     return len;
1291 }
1292
1293
1294 // swap 32bit String to 32bit MB
1295 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1296 {
1297     size_t len=0;
1298
1299     while (*psz && (!buf || len < n))
1300     {
1301         if (buf)
1302         {
1303             *buf++ = ((char *)psz)[3];
1304             *buf++ = ((char *)psz)[2];
1305             *buf++ = ((char *)psz)[1];
1306             *buf++ = ((char *)psz)[0];
1307         }
1308         len += sizeof(wxUint32);
1309         psz++;
1310     }
1311
1312     if (buf && len<=n-sizeof(wxUint32))
1313         *(wxUint32*)buf=0;
1314
1315     return len;
1316 }
1317
1318
1319 #endif // WC_UTF16
1320
1321
1322 // ============================================================================
1323 // The classes doing conversion using the iconv_xxx() functions
1324 // ============================================================================
1325
1326 #ifdef HAVE_ICONV
1327
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that compound expressions
//     (e.g. using "?:") can be passed to ICONV_FAILED() safely
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type iconv() expects
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value of an iconv_t which is not a valid conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
1346
1347 #if SIZEOF_WCHAR_T == 4
1348     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1349     #define WC_ENC      wxFONTENCODING_UTF32
1350 #elif SIZEOF_WCHAR_T == 2
1351     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1352     #define WC_ENC      wxFONTENCODING_UTF16
1353 #else // sizeof(wchar_t) != 2 nor 4
1354     // does this ever happen?
1355     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1356 #endif
1357
1358 // ----------------------------------------------------------------------------
1359 // wxMBConv_iconv: encapsulates an iconv character set
1360 // ----------------------------------------------------------------------------
1361
1362 class wxMBConv_iconv : public wxMBConv
1363 {
1364 public:
1365     wxMBConv_iconv(const wxChar *name);
1366     virtual ~wxMBConv_iconv();
1367
1368     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1369     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1370
1371     bool IsOk() const
1372         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1373
1374 protected:
1375     // the iconv handlers used to translate from multibyte to wide char and in
1376     // the other direction
1377     iconv_t m2w,
1378             w2m;
1379 #if wxUSE_THREADS
1380     // guards access to m2w and w2m objects
1381     wxMutex m_iconvMutex;
1382 #endif
1383
1384 private:
1385     // classify this encoding as explained in wxMBConv::GetMinMBCharWidth()
1386     // comment
1387     virtual size_t GetMinMBCharWidth() const;
1388
1389     // the name (for iconv_open()) of a wide char charset -- if none is
1390     // available on this machine, it will remain NULL
1391     static wxString ms_wcCharsetName;
1392
1393     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1394     // different endian-ness than the native one
1395     static bool ms_wcNeedsSwap;
1396
1397     // cached result of GetMinMBCharWidth(); set to 0 meaning "unknown"
1398     // initially
1399     size_t m_minMBCharWidth;
1400 };
1401
1402 // make the constructor available for unit testing
1403 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1404 {
1405     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1406     if ( !result->IsOk() )
1407     {
1408         delete result;
1409         return 0;
1410     }
1411     return result;
1412 }
1413
1414 wxString wxMBConv_iconv::ms_wcCharsetName;
1415 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1416
1417 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1418 {
1419     m_minMBCharWidth = 0;
1420
1421     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1422     // names for the charsets
1423     const wxCharBuffer cname(wxString(name).ToAscii());
1424
1425     // check for charset that represents wchar_t:
1426     if ( ms_wcCharsetName.empty() )
1427     {
1428         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1429
1430 #if wxUSE_FONTMAP
1431         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1432 #else // !wxUSE_FONTMAP
1433         static const wxChar *names[] =
1434         {
1435 #if SIZEOF_WCHAR_T == 4
1436             _T("UCS-4"),
1437 #elif SIZEOF_WCHAR_T = 2
1438             _T("UCS-2"),
1439 #endif
1440             NULL
1441         };
1442 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1443
1444         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1445         {
1446             const wxString nameCS(*names);
1447
1448             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1449             wxString nameXE(nameCS);
1450             #ifdef WORDS_BIGENDIAN
1451                 nameXE += _T("BE");
1452             #else // little endian
1453                 nameXE += _T("LE");
1454             #endif
1455
1456             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1457                        nameXE.c_str());
1458
1459             m2w = iconv_open(nameXE.ToAscii(), cname);
1460             if ( m2w == ICONV_T_INVALID )
1461             {
1462                 // try charset w/o bytesex info (e.g. "UCS4")
1463                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1464                            nameCS.c_str());
1465                 m2w = iconv_open(nameCS.ToAscii(), cname);
1466
1467                 // and check for bytesex ourselves:
1468                 if ( m2w != ICONV_T_INVALID )
1469                 {
1470                     char    buf[2], *bufPtr;
1471                     wchar_t wbuf[2], *wbufPtr;
1472                     size_t  insz, outsz;
1473                     size_t  res;
1474
1475                     buf[0] = 'A';
1476                     buf[1] = 0;
1477                     wbuf[0] = 0;
1478                     insz = 2;
1479                     outsz = SIZEOF_WCHAR_T * 2;
1480                     wbufPtr = wbuf;
1481                     bufPtr = buf;
1482
1483                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1484                                 (char**)&wbufPtr, &outsz);
1485
1486                     if (ICONV_FAILED(res, insz))
1487                     {
1488                         wxLogLastError(wxT("iconv"));
1489                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1490                                    nameCS.c_str());
1491                     }
1492                     else // ok, can convert to this encoding, remember it
1493                     {
1494                         ms_wcCharsetName = nameCS;
1495                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1496                     }
1497                 }
1498             }
1499             else // use charset not requiring byte swapping
1500             {
1501                 ms_wcCharsetName = nameXE;
1502             }
1503         }
1504
1505         wxLogTrace(TRACE_STRCONV,
1506                    wxT("iconv wchar_t charset is \"%s\"%s"),
1507                    ms_wcCharsetName.empty() ? _T("<none>")
1508                                             : ms_wcCharsetName.c_str(),
1509                    ms_wcNeedsSwap ? _T(" (needs swap)")
1510                                   : _T(""));
1511     }
1512     else // we already have ms_wcCharsetName
1513     {
1514         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1515     }
1516
1517     if ( ms_wcCharsetName.empty() )
1518     {
1519         w2m = ICONV_T_INVALID;
1520     }
1521     else
1522     {
1523         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1524         if ( w2m == ICONV_T_INVALID )
1525         {
1526             wxLogTrace(TRACE_STRCONV,
1527                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1528                        ms_wcCharsetName.c_str(), cname.data());
1529         }
1530     }
1531 }
1532
1533 wxMBConv_iconv::~wxMBConv_iconv()
1534 {
1535     if ( m2w != ICONV_T_INVALID )
1536         iconv_close(m2w);
1537     if ( w2m != ICONV_T_INVALID )
1538         iconv_close(w2m);
1539 }
1540
1541 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1542 {
1543 #if wxUSE_THREADS
1544     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1545     //     Unfortunately there is a couple of global wxCSConv objects such as
1546     //     wxConvLocal that are used all over wx code, so we have to make sure
1547     //     the handle is used by at most one thread at the time. Otherwise
1548     //     only a few wx classes would be safe to use from non-main threads
1549     //     as MB<->WC conversion would fail "randomly".
1550     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1551 #endif
1552
1553     size_t inbuf = strlen(psz);
1554     size_t outbuf = n * SIZEOF_WCHAR_T;
1555     size_t res, cres;
1556     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1557     wchar_t *bufPtr = buf;
1558     const char *pszPtr = psz;
1559
1560     if (buf)
1561     {
1562         // have destination buffer, convert there
1563         cres = iconv(m2w,
1564                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1565                      (char**)&bufPtr, &outbuf);
1566         res = n - (outbuf / SIZEOF_WCHAR_T);
1567
1568         if (ms_wcNeedsSwap)
1569         {
1570             // convert to native endianness
1571             for ( unsigned i = 0; i < res; i++ )
1572                 buf[n] = WC_BSWAP(buf[i]);
1573         }
1574
1575         // NB: iconv was given only strlen(psz) characters on input, and so
1576         //     it couldn't convert the trailing zero. Let's do it ourselves
1577         //     if there's some room left for it in the output buffer.
1578         if (res < n)
1579             buf[res] = 0;
1580     }
1581     else
1582     {
1583         // no destination buffer... convert using temp buffer
1584         // to calculate destination buffer requirement
1585         wchar_t tbuf[8];
1586         res = 0;
1587         do {
1588             bufPtr = tbuf;
1589             outbuf = 8*SIZEOF_WCHAR_T;
1590
1591             cres = iconv(m2w,
1592                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1593                          (char**)&bufPtr, &outbuf );
1594
1595             res += 8-(outbuf/SIZEOF_WCHAR_T);
1596         } while ((cres==(size_t)-1) && (errno==E2BIG));
1597     }
1598
1599     if (ICONV_FAILED(cres, inbuf))
1600     {
1601         //VS: it is ok if iconv fails, hence trace only
1602         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1603         return (size_t)-1;
1604     }
1605
1606     return res;
1607 }
1608
1609 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1610 {
1611 #if wxUSE_THREADS
1612     // NB: explained in MB2WC
1613     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1614 #endif
1615
1616     size_t inlen = wxWcslen(psz);
1617     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1618     size_t outbuf = n;
1619     size_t res, cres;
1620
1621     wchar_t *tmpbuf = 0;
1622
1623     if (ms_wcNeedsSwap)
1624     {
1625         // need to copy to temp buffer to switch endianness
1626         // (doing WC_BSWAP twice on the original buffer won't help, as it
1627         //  could be in read-only memory, or be accessed in some other thread)
1628         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1629         for ( size_t i = 0; i < inlen; i++ )
1630             tmpbuf[n] = WC_BSWAP(psz[i]);
1631         tmpbuf[inlen] = L'\0';
1632         psz = tmpbuf;
1633     }
1634
1635     if (buf)
1636     {
1637         // have destination buffer, convert there
1638         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1639
1640         res = n-outbuf;
1641
1642         // NB: iconv was given only wcslen(psz) characters on input, and so
1643         //     it couldn't convert the trailing zero. Let's do it ourselves
1644         //     if there's some room left for it in the output buffer.
1645         if (res < n)
1646             buf[0] = 0;
1647     }
1648     else
1649     {
1650         // no destination buffer... convert using temp buffer
1651         // to calculate destination buffer requirement
1652         char tbuf[16];
1653         res = 0;
1654         do {
1655             buf = tbuf; outbuf = 16;
1656
1657             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1658
1659             res += 16 - outbuf;
1660         } while ((cres==(size_t)-1) && (errno==E2BIG));
1661     }
1662
1663     if (ms_wcNeedsSwap)
1664     {
1665         free(tmpbuf);
1666     }
1667
1668     if (ICONV_FAILED(cres, inbuf))
1669     {
1670         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1671         return (size_t)-1;
1672     }
1673
1674     return res;
1675 }
1676
1677 size_t wxMBConv_iconv::GetMinMBCharWidth() const
1678 {
1679     if ( m_minMBCharWidth == 0 )
1680     {
1681         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1682
1683 #if wxUSE_THREADS
1684         // NB: explained in MB2WC
1685         wxMutexLocker lock(self->m_iconvMutex);
1686 #endif
1687
1688         wchar_t *wnul = L"";
1689         char buf[8]; // should be enough for NUL in any encoding
1690         size_t inLen = sizeof(wchar_t),
1691                outLen = WXSIZEOF(buf);
1692         char *in = (char *)wnul;
1693         char *out = buf;
1694         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1695         {
1696             self->m_minMBCharWidth = (size_t)-1;
1697         }
1698         else // ok
1699         {
1700             self->m_minMBCharWidth = out - buf;
1701         }
1702     }
1703
1704     return m_minMBCharWidth;
1705 }
1706
1707 #endif // HAVE_ICONV
1708
1709
1710 // ============================================================================
1711 // Win32 conversion classes
1712 // ============================================================================
1713
1714 #ifdef wxHAVE_WIN32_MB2WC
1715
// functions from utils.cpp mapping a font encoding or a charset name to a
// code page, declared only in the builds using wxFontMapper
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
#endif // wxUSE_FONTMAP
1721
1722 class wxMBConv_win32 : public wxMBConv
1723 {
1724 public:
1725     wxMBConv_win32()
1726     {
1727         m_CodePage = CP_ACP;
1728         m_minMBCharWidth = 0;
1729     }
1730
1731 #if wxUSE_FONTMAP
1732     wxMBConv_win32(const wxChar* name)
1733     {
1734         m_CodePage = wxCharsetToCodepage(name);
1735         m_minMBCharWidth = 0;
1736     }
1737
1738     wxMBConv_win32(wxFontEncoding encoding)
1739     {
1740         m_CodePage = wxEncodingToCodepage(encoding);
1741         m_minMBCharWidth = 0;
1742     }
1743 #endif // wxUSE_FONTMAP
1744
1745     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1746     {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
1752         //
1753         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1754         // an error (tested under Windows Server 2003) and apparently it is
1755         // done on purpose, i.e. the function accepts any input in this case
1756         // and although I'd prefer to return error on ill-formed output, our
1757         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1758         // explicitly ill-formed according to RFC 2152) neither so we don't
1759         // even have any fallback here...
1760         //
1761         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1762         // Win XP or newer and if it is specified on older versions, conversion
1763         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1764         // fails. So we can only use the flag on newer Windows versions.
1765         // Additionally, the flag is not supported by UTF7, symbol and CJK
1766         // encodings. See here:
1767         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1768         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1769         int flags = 0;
1770         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1771              m_CodePage < 50000 &&
1772              IsAtLeastWin2kSP4() )
1773         {
1774             flags = MB_ERR_INVALID_CHARS;
1775         }
1776         else if ( m_CodePage == CP_UTF8 )
1777         {
1778             // Avoid round-trip in the special case of UTF-8 by using our
1779             // own UTF-8 conversion code:
1780             return wxMBConvUTF8().MB2WC(buf, psz, n);
1781         }
1782
1783         const size_t len = ::MultiByteToWideChar
1784                              (
1785                                 m_CodePage,     // code page
1786                                 flags,          // flags: fall on error
1787                                 psz,            // input string
1788                                 -1,             // its length (NUL-terminated)
1789                                 buf,            // output string
1790                                 buf ? n : 0     // size of output buffer
1791                              );
1792         if ( !len )
1793         {
1794             // function totally failed
1795             return (size_t)-1;
1796         }
1797
1798         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1799         // check if we succeeded, by doing a double trip:
1800         if ( !flags && buf )
1801         {
1802             const size_t mbLen = strlen(psz);
1803             wxCharBuffer mbBuf(mbLen);
1804             if ( ::WideCharToMultiByte
1805                    (
1806                       m_CodePage,
1807                       0,
1808                       buf,
1809                       -1,
1810                       mbBuf.data(),
1811                       mbLen + 1,        // size in bytes, not length
1812                       NULL,
1813                       NULL
1814                    ) == 0 ||
1815                   strcmp(mbBuf, psz) != 0 )
1816             {
1817                 // we didn't obtain the same thing we started from, hence
1818                 // the conversion was lossy and we consider that it failed
1819                 return (size_t)-1;
1820             }
1821         }
1822
1823         // note that it returns count of written chars for buf != NULL and size
1824         // of the needed buffer for buf == NULL so in either case the length of
1825         // the string (which never includes the terminating NUL) is one less
1826         return len - 1;
1827     }
1828
1829     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1830     {
1831         /*
1832             we have a problem here: by default, WideCharToMultiByte() may
1833             replace characters unrepresentable in the target code page with bad
1834             quality approximations such as turning "1/2" symbol (U+00BD) into
1835             "1" for the code pages which don't have it and we, obviously, want
1836             to avoid this at any price
1837
1838             the trouble is that this function does it _silently_, i.e. it won't
1839             even tell us whether it did or not... Win98/2000 and higher provide
1840             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1841             we have to resort to a round trip, i.e. check that converting back
1842             results in the same string -- this is, of course, expensive but
1843             otherwise we simply can't be sure to not garble the data.
1844          */
1845
1846         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1847         // it doesn't work with CJK encodings (which we test for rather roughly
1848         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1849         // supporting it
1850         BOOL usedDef wxDUMMY_INITIALIZE(false);
1851         BOOL *pUsedDef;
1852         int flags;
1853         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1854         {
1855             // it's our lucky day
1856             flags = WC_NO_BEST_FIT_CHARS;
1857             pUsedDef = &usedDef;
1858         }
1859         else // old system or unsupported encoding
1860         {
1861             flags = 0;
1862             pUsedDef = NULL;
1863         }
1864
1865         const size_t len = ::WideCharToMultiByte
1866                              (
1867                                 m_CodePage,     // code page
1868                                 flags,          // either none or no best fit
1869                                 pwz,            // input string
1870                                 -1,             // it is (wide) NUL-terminated
1871                                 buf,            // output buffer
1872                                 buf ? n : 0,    // and its size
1873                                 NULL,           // default "replacement" char
1874                                 pUsedDef        // [out] was it used?
1875                              );
1876
1877         if ( !len )
1878         {
1879             // function totally failed
1880             return (size_t)-1;
1881         }
1882
1883         // if we were really converting, check if we succeeded
1884         if ( buf )
1885         {
1886             if ( flags )
1887             {
1888                 // check if the conversion failed, i.e. if any replacements
1889                 // were done
1890                 if ( usedDef )
1891                     return (size_t)-1;
1892             }
1893             else // we must resort to double tripping...
1894             {
1895                 wxWCharBuffer wcBuf(n);
1896                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1897                         wcscmp(wcBuf, pwz) != 0 )
1898                 {
1899                     // we didn't obtain the same thing we started from, hence
1900                     // the conversion was lossy and we consider that it failed
1901                     return (size_t)-1;
1902                 }
1903             }
1904         }
1905
1906         // see the comment above for the reason of "len - 1"
1907         return len - 1;
1908     }
1909
1910     bool IsOk() const { return m_CodePage != -1; }
1911
1912 private:
1913     static bool CanUseNoBestFit()
1914     {
1915         static int s_isWin98Or2k = -1;
1916
1917         if ( s_isWin98Or2k == -1 )
1918         {
1919             int verMaj, verMin;
1920             switch ( wxGetOsVersion(&verMaj, &verMin) )
1921             {
1922                 case wxWIN95:
1923                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1924                     break;
1925
1926                 case wxWINDOWS_NT:
1927                     s_isWin98Or2k = verMaj >= 5;
1928                     break;
1929
1930                 default:
                    // unknown, be conservative by default
1932                     s_isWin98Or2k = 0;
1933             }
1934
1935             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1936         }
1937
1938         return s_isWin98Or2k == 1;
1939     }
1940
1941     static bool IsAtLeastWin2kSP4()
1942     {
1943 #ifdef __WXWINCE__
1944         return false;
1945 #else
1946         static int s_isAtLeastWin2kSP4 = -1;
1947
1948         if ( s_isAtLeastWin2kSP4 == -1 )
1949         {
1950             OSVERSIONINFOEX ver;
1951
1952             memset(&ver, 0, sizeof(ver));
1953             ver.dwOSVersionInfoSize = sizeof(ver);
1954             GetVersionEx((OSVERSIONINFO*)&ver);
1955
1956             s_isAtLeastWin2kSP4 =
1957               ((ver.dwMajorVersion > 5) || // Vista+
1958                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
1959                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
1960                ver.wServicePackMajor >= 4)) // 2000 SP4+
1961               ? 1 : 0;
1962         }
1963
1964         return s_isAtLeastWin2kSP4 == 1;
1965 #endif
1966     }
1967
1968     virtual size_t GetMinMBCharWidth() const
1969     {
1970         if ( m_minMBCharWidth == 0 )
1971         {
1972             int len = ::WideCharToMultiByte
1973                         (
1974                             m_CodePage,     // code page
1975                             0,              // no flags
1976                             L"",            // input string
1977                             1,              // translate just the NUL
1978                             NULL,           // output buffer
1979                             0,              // and its size
1980                             NULL,           // no replacement char
1981                             NULL            // [out] don't care if it was used
1982                         );
1983
1984             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1985             switch ( len )
1986             {
1987                 default:
1988                     wxLogDebug(_T("Unexpected NUL length %d"), len);
1989                     // fall through
1990
1991                 case 0:
1992                     self->m_minMBCharWidth = (size_t)-1;
1993                     break;
1994
1995                 case 1:
1996                 case 2:
1997                 case 4:
1998                     self->m_minMBCharWidth = len;
1999                     break;
2000             }
2001         }
2002
2003         return m_minMBCharWidth;
2004     }
2005
2006     // the code page we're working with
2007     long m_CodePage;
2008
2009     // cached result of GetMinMBCharWidth(), set to 0 initially meaning
2010     // "unknown"
2011     size_t m_minMBCharWidth;
2012 };
2013
2014 #endif // wxHAVE_WIN32_MB2WC
2015
2016 // ============================================================================
2017 // Cocoa conversion classes
2018 // ============================================================================
2019
2020 #if defined(__WXCOCOA__)
2021
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2025
2026 #include <CoreFoundation/CFString.h>
2027 #include <CoreFoundation/CFStringEncodingExt.h>
2028
2029 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2030 {
2031     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2032     if ( encoding == wxFONTENCODING_DEFAULT )
2033     {
2034         enc = CFStringGetSystemEncoding();
2035     }
2036     else switch( encoding)
2037     {
2038         case wxFONTENCODING_ISO8859_1 :
2039             enc = kCFStringEncodingISOLatin1 ;
2040             break ;
2041         case wxFONTENCODING_ISO8859_2 :
2042             enc = kCFStringEncodingISOLatin2;
2043             break ;
2044         case wxFONTENCODING_ISO8859_3 :
2045             enc = kCFStringEncodingISOLatin3 ;
2046             break ;
2047         case wxFONTENCODING_ISO8859_4 :
2048             enc = kCFStringEncodingISOLatin4;
2049             break ;
2050         case wxFONTENCODING_ISO8859_5 :
2051             enc = kCFStringEncodingISOLatinCyrillic;
2052             break ;
2053         case wxFONTENCODING_ISO8859_6 :
2054             enc = kCFStringEncodingISOLatinArabic;
2055             break ;
2056         case wxFONTENCODING_ISO8859_7 :
2057             enc = kCFStringEncodingISOLatinGreek;
2058             break ;
2059         case wxFONTENCODING_ISO8859_8 :
2060             enc = kCFStringEncodingISOLatinHebrew;
2061             break ;
2062         case wxFONTENCODING_ISO8859_9 :
2063             enc = kCFStringEncodingISOLatin5;
2064             break ;
2065         case wxFONTENCODING_ISO8859_10 :
2066             enc = kCFStringEncodingISOLatin6;
2067             break ;
2068         case wxFONTENCODING_ISO8859_11 :
2069             enc = kCFStringEncodingISOLatinThai;
2070             break ;
2071         case wxFONTENCODING_ISO8859_13 :
2072             enc = kCFStringEncodingISOLatin7;
2073             break ;
2074         case wxFONTENCODING_ISO8859_14 :
2075             enc = kCFStringEncodingISOLatin8;
2076             break ;
2077         case wxFONTENCODING_ISO8859_15 :
2078             enc = kCFStringEncodingISOLatin9;
2079             break ;
2080
2081         case wxFONTENCODING_KOI8 :
2082             enc = kCFStringEncodingKOI8_R;
2083             break ;
2084         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2085             enc = kCFStringEncodingDOSRussian;
2086             break ;
2087
2088 //      case wxFONTENCODING_BULGARIAN :
2089 //          enc = ;
2090 //          break ;
2091
2092         case wxFONTENCODING_CP437 :
2093             enc =kCFStringEncodingDOSLatinUS ;
2094             break ;
2095         case wxFONTENCODING_CP850 :
2096             enc = kCFStringEncodingDOSLatin1;
2097             break ;
2098         case wxFONTENCODING_CP852 :
2099             enc = kCFStringEncodingDOSLatin2;
2100             break ;
2101         case wxFONTENCODING_CP855 :
2102             enc = kCFStringEncodingDOSCyrillic;
2103             break ;
2104         case wxFONTENCODING_CP866 :
2105             enc =kCFStringEncodingDOSRussian ;
2106             break ;
2107         case wxFONTENCODING_CP874 :
2108             enc = kCFStringEncodingDOSThai;
2109             break ;
2110         case wxFONTENCODING_CP932 :
2111             enc = kCFStringEncodingDOSJapanese;
2112             break ;
2113         case wxFONTENCODING_CP936 :
2114             enc =kCFStringEncodingDOSChineseSimplif ;
2115             break ;
2116         case wxFONTENCODING_CP949 :
2117             enc = kCFStringEncodingDOSKorean;
2118             break ;
2119         case wxFONTENCODING_CP950 :
2120             enc = kCFStringEncodingDOSChineseTrad;
2121             break ;
2122         case wxFONTENCODING_CP1250 :
2123             enc = kCFStringEncodingWindowsLatin2;
2124             break ;
2125         case wxFONTENCODING_CP1251 :
2126             enc =kCFStringEncodingWindowsCyrillic ;
2127             break ;
2128         case wxFONTENCODING_CP1252 :
2129             enc =kCFStringEncodingWindowsLatin1 ;
2130             break ;
2131         case wxFONTENCODING_CP1253 :
2132             enc = kCFStringEncodingWindowsGreek;
2133             break ;
2134         case wxFONTENCODING_CP1254 :
2135             enc = kCFStringEncodingWindowsLatin5;
2136             break ;
2137         case wxFONTENCODING_CP1255 :
2138             enc =kCFStringEncodingWindowsHebrew ;
2139             break ;
2140         case wxFONTENCODING_CP1256 :
2141             enc =kCFStringEncodingWindowsArabic ;
2142             break ;
2143         case wxFONTENCODING_CP1257 :
2144             enc = kCFStringEncodingWindowsBalticRim;
2145             break ;
2146 //   This only really encodes to UTF7 (if that) evidently
2147 //        case wxFONTENCODING_UTF7 :
2148 //            enc = kCFStringEncodingNonLossyASCII ;
2149 //            break ;
2150         case wxFONTENCODING_UTF8 :
2151             enc = kCFStringEncodingUTF8 ;
2152             break ;
2153         case wxFONTENCODING_EUC_JP :
2154             enc = kCFStringEncodingEUC_JP;
2155             break ;
2156         case wxFONTENCODING_UTF16 :
2157             enc = kCFStringEncodingUnicode ;
2158             break ;
2159         case wxFONTENCODING_MACROMAN :
2160             enc = kCFStringEncodingMacRoman ;
2161             break ;
2162         case wxFONTENCODING_MACJAPANESE :
2163             enc = kCFStringEncodingMacJapanese ;
2164             break ;
2165         case wxFONTENCODING_MACCHINESETRAD :
2166             enc = kCFStringEncodingMacChineseTrad ;
2167             break ;
2168         case wxFONTENCODING_MACKOREAN :
2169             enc = kCFStringEncodingMacKorean ;
2170             break ;
2171         case wxFONTENCODING_MACARABIC :
2172             enc = kCFStringEncodingMacArabic ;
2173             break ;
2174         case wxFONTENCODING_MACHEBREW :
2175             enc = kCFStringEncodingMacHebrew ;
2176             break ;
2177         case wxFONTENCODING_MACGREEK :
2178             enc = kCFStringEncodingMacGreek ;
2179             break ;
2180         case wxFONTENCODING_MACCYRILLIC :
2181             enc = kCFStringEncodingMacCyrillic ;
2182             break ;
2183         case wxFONTENCODING_MACDEVANAGARI :
2184             enc = kCFStringEncodingMacDevanagari ;
2185             break ;
2186         case wxFONTENCODING_MACGURMUKHI :
2187             enc = kCFStringEncodingMacGurmukhi ;
2188             break ;
2189         case wxFONTENCODING_MACGUJARATI :
2190             enc = kCFStringEncodingMacGujarati ;
2191             break ;
2192         case wxFONTENCODING_MACORIYA :
2193             enc = kCFStringEncodingMacOriya ;
2194             break ;
2195         case wxFONTENCODING_MACBENGALI :
2196             enc = kCFStringEncodingMacBengali ;
2197             break ;
2198         case wxFONTENCODING_MACTAMIL :
2199             enc = kCFStringEncodingMacTamil ;
2200             break ;
2201         case wxFONTENCODING_MACTELUGU :
2202             enc = kCFStringEncodingMacTelugu ;
2203             break ;
2204         case wxFONTENCODING_MACKANNADA :
2205             enc = kCFStringEncodingMacKannada ;
2206             break ;
2207         case wxFONTENCODING_MACMALAJALAM :
2208             enc = kCFStringEncodingMacMalayalam ;
2209             break ;
2210         case wxFONTENCODING_MACSINHALESE :
2211             enc = kCFStringEncodingMacSinhalese ;
2212             break ;
2213         case wxFONTENCODING_MACBURMESE :
2214             enc = kCFStringEncodingMacBurmese ;
2215             break ;
2216         case wxFONTENCODING_MACKHMER :
2217             enc = kCFStringEncodingMacKhmer ;
2218             break ;
2219         case wxFONTENCODING_MACTHAI :
2220             enc = kCFStringEncodingMacThai ;
2221             break ;
2222         case wxFONTENCODING_MACLAOTIAN :
2223             enc = kCFStringEncodingMacLaotian ;
2224             break ;
2225         case wxFONTENCODING_MACGEORGIAN :
2226             enc = kCFStringEncodingMacGeorgian ;
2227             break ;
2228         case wxFONTENCODING_MACARMENIAN :
2229             enc = kCFStringEncodingMacArmenian ;
2230             break ;
2231         case wxFONTENCODING_MACCHINESESIMP :
2232             enc = kCFStringEncodingMacChineseSimp ;
2233             break ;
2234         case wxFONTENCODING_MACTIBETAN :
2235             enc = kCFStringEncodingMacTibetan ;
2236             break ;
2237         case wxFONTENCODING_MACMONGOLIAN :
2238             enc = kCFStringEncodingMacMongolian ;
2239             break ;
2240         case wxFONTENCODING_MACETHIOPIC :
2241             enc = kCFStringEncodingMacEthiopic ;
2242             break ;
2243         case wxFONTENCODING_MACCENTRALEUR :
2244             enc = kCFStringEncodingMacCentralEurRoman ;
2245             break ;
2246         case wxFONTENCODING_MACVIATNAMESE :
2247             enc = kCFStringEncodingMacVietnamese ;
2248             break ;
2249         case wxFONTENCODING_MACARABICEXT :
2250             enc = kCFStringEncodingMacExtArabic ;
2251             break ;
2252         case wxFONTENCODING_MACSYMBOL :
2253             enc = kCFStringEncodingMacSymbol ;
2254             break ;
2255         case wxFONTENCODING_MACDINGBATS :
2256             enc = kCFStringEncodingMacDingbats ;
2257             break ;
2258         case wxFONTENCODING_MACTURKISH :
2259             enc = kCFStringEncodingMacTurkish ;
2260             break ;
2261         case wxFONTENCODING_MACCROATIAN :
2262             enc = kCFStringEncodingMacCroatian ;
2263             break ;
2264         case wxFONTENCODING_MACICELANDIC :
2265             enc = kCFStringEncodingMacIcelandic ;
2266             break ;
2267         case wxFONTENCODING_MACROMANIAN :
2268             enc = kCFStringEncodingMacRomanian ;
2269             break ;
2270         case wxFONTENCODING_MACCELTIC :
2271             enc = kCFStringEncodingMacCeltic ;
2272             break ;
2273         case wxFONTENCODING_MACGAELIC :
2274             enc = kCFStringEncodingMacGaelic ;
2275             break ;
2276 //      case wxFONTENCODING_MACKEYBOARD :
2277 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2278 //          break ;
2279         default :
2280             // because gcc is picky
2281             break ;
2282     } ;
2283     return enc ;
2284 }
2285
2286 class wxMBConv_cocoa : public wxMBConv
2287 {
2288 public:
2289     wxMBConv_cocoa()
2290     {
2291         Init(CFStringGetSystemEncoding()) ;
2292     }
2293
2294 #if wxUSE_FONTMAP
2295     wxMBConv_cocoa(const wxChar* name)
2296     {
2297         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2298     }
2299 #endif
2300
2301     wxMBConv_cocoa(wxFontEncoding encoding)
2302     {
2303         Init( wxCFStringEncFromFontEnc(encoding) );
2304     }
2305
2306     ~wxMBConv_cocoa()
2307     {
2308     }
2309
2310     void Init( CFStringEncoding encoding)
2311     {
2312         m_encoding = encoding ;
2313     }
2314
2315     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2316     {
2317         wxASSERT(szUnConv);
2318
2319         CFStringRef theString = CFStringCreateWithBytes (
2320                                                 NULL, //the allocator
2321                                                 (const UInt8*)szUnConv,
2322                                                 strlen(szUnConv),
2323                                                 m_encoding,
2324                                                 false //no BOM/external representation
2325                                                 );
2326
2327         wxASSERT(theString);
2328
2329         size_t nOutLength = CFStringGetLength(theString);
2330
2331         if (szOut == NULL)
2332         {
2333             CFRelease(theString);
2334             return nOutLength;
2335         }
2336
2337         CFRange theRange = { 0, nOutSize };
2338
2339 #if SIZEOF_WCHAR_T == 4
2340         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2341 #endif
2342
2343         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2344
2345         CFRelease(theString);
2346
2347         szUniCharBuffer[nOutLength] = '\0' ;
2348
2349 #if SIZEOF_WCHAR_T == 4
2350         wxMBConvUTF16 converter ;
2351         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2352         delete[] szUniCharBuffer;
2353 #endif
2354
2355         return nOutLength;
2356     }
2357
2358     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2359     {
2360         wxASSERT(szUnConv);
2361
2362         size_t nRealOutSize;
2363         size_t nBufSize = wxWcslen(szUnConv);
2364         UniChar* szUniBuffer = (UniChar*) szUnConv;
2365
2366 #if SIZEOF_WCHAR_T == 4
2367         wxMBConvUTF16 converter ;
2368         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2369         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2370         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2371         nBufSize /= sizeof(UniChar);
2372 #endif
2373
2374         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2375                                 NULL, //allocator
2376                                 szUniBuffer,
2377                                 nBufSize,
2378                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2379                             );
2380
2381         wxASSERT(theString);
2382
2383         //Note that CER puts a BOM when converting to unicode
2384         //so we  check and use getchars instead in that case
2385         if (m_encoding == kCFStringEncodingUnicode)
2386         {
2387             if (szOut != NULL)
2388                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2389
2390             nRealOutSize = CFStringGetLength(theString) + 1;
2391         }
2392         else
2393         {
2394             CFStringGetBytes(
2395                 theString,
2396                 CFRangeMake(0, CFStringGetLength(theString)),
2397                 m_encoding,
2398                 0, //what to put in characters that can't be converted -
2399                     //0 tells CFString to return NULL if it meets such a character
2400                 false, //not an external representation
2401                 (UInt8*) szOut,
2402                 nOutSize,
2403                 (CFIndex*) &nRealOutSize
2404                         );
2405         }
2406
2407         CFRelease(theString);
2408
2409 #if SIZEOF_WCHAR_T == 4
2410         delete[] szUniBuffer;
2411 #endif
2412
2413         return  nRealOutSize - 1;
2414     }
2415
2416     bool IsOk() const
2417     {
2418         return m_encoding != kCFStringEncodingInvalidId &&
2419               CFStringIsEncodingAvailable(m_encoding);
2420     }
2421
2422 private:
2423     CFStringEncoding m_encoding ;
2424 };
2425
2426 #endif // defined(__WXCOCOA__)
2427
2428 // ============================================================================
2429 // Mac conversion classes
2430 // ============================================================================
2431
2432 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2433
2434 class wxMBConv_mac : public wxMBConv
2435 {
2436 public:
2437     wxMBConv_mac()
2438     {
2439         Init(CFStringGetSystemEncoding()) ;
2440     }
2441
2442 #if wxUSE_FONTMAP
2443     wxMBConv_mac(const wxChar* name)
2444     {
2445         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2446     }
2447 #endif
2448
2449     wxMBConv_mac(wxFontEncoding encoding)
2450     {
2451         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2452     }
2453
2454     ~wxMBConv_mac()
2455     {
2456         OSStatus status = noErr ;
2457         status = TECDisposeConverter(m_MB2WC_converter);
2458         status = TECDisposeConverter(m_WC2MB_converter);
2459     }
2460
2461
2462     void Init( TextEncodingBase encoding)
2463     {
2464         OSStatus status = noErr ;
2465         m_char_encoding = encoding ;
2466         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2467
2468         status = TECCreateConverter(&m_MB2WC_converter,
2469                                     m_char_encoding,
2470                                     m_unicode_encoding);
2471         status = TECCreateConverter(&m_WC2MB_converter,
2472                                     m_unicode_encoding,
2473                                     m_char_encoding);
2474     }
2475
2476     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2477     {
2478         OSStatus status = noErr ;
2479         ByteCount byteOutLen ;
2480         ByteCount byteInLen = strlen(psz) ;
2481         wchar_t *tbuf = NULL ;
2482         UniChar* ubuf = NULL ;
2483         size_t res = 0 ;
2484
2485         if (buf == NULL)
2486         {
2487             //apple specs say at least 32
2488             n = wxMax( 32 , byteInLen ) ;
2489             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2490         }
2491         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2492 #if SIZEOF_WCHAR_T == 4
2493         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2494 #else
2495         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2496 #endif
2497         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2498           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2499 #if SIZEOF_WCHAR_T == 4
2500         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2501         // is not properly terminated we get random characters at the end
2502         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2503         wxMBConvUTF16 converter ;
2504         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2505         free( ubuf ) ;
2506 #else
2507         res = byteOutLen / sizeof( UniChar ) ;
2508 #endif
2509         if ( buf == NULL )
2510              free(tbuf) ;
2511
2512         if ( buf  && res < n)
2513             buf[res] = 0;
2514
2515         return res ;
2516     }
2517
2518     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2519     {
2520         OSStatus status = noErr ;
2521         ByteCount byteOutLen ;
2522         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2523
2524         char *tbuf = NULL ;
2525
2526         if (buf == NULL)
2527         {
2528             //apple specs say at least 32
2529             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2530             tbuf = (char*) malloc( n ) ;
2531         }
2532
2533         ByteCount byteBufferLen = n ;
2534         UniChar* ubuf = NULL ;
2535 #if SIZEOF_WCHAR_T == 4
2536         wxMBConvUTF16 converter ;
2537         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2538         byteInLen = unicharlen ;
2539         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2540         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2541 #else
2542         ubuf = (UniChar*) psz ;
2543 #endif
2544         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2545             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2546 #if SIZEOF_WCHAR_T == 4
2547         free( ubuf ) ;
2548 #endif
2549         if ( buf == NULL )
2550             free(tbuf) ;
2551
2552         size_t res = byteOutLen ;
2553         if ( buf  && res < n)
2554         {
2555             buf[res] = 0;
2556
2557             //we need to double-trip to verify it didn't insert any ? in place
2558             //of bogus characters
2559             wxWCharBuffer wcBuf(n);
2560             size_t pszlen = wxWcslen(psz);
2561             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2562                         wxWcslen(wcBuf) != pszlen ||
2563                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2564             {
2565                 // we didn't obtain the same thing we started from, hence
2566                 // the conversion was lossy and we consider that it failed
2567                 return (size_t)-1;
2568             }
2569         }
2570
2571         return res ;
2572     }
2573
2574     bool IsOk() const
2575         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2576
2577 private:
2578     TECObjectRef m_MB2WC_converter ;
2579     TECObjectRef m_WC2MB_converter ;
2580
2581     TextEncodingBase m_char_encoding ;
2582     TextEncodingBase m_unicode_encoding ;
2583 };
2584
2585 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2586
2587 // ============================================================================
2588 // wxEncodingConverter based conversion classes
2589 // ============================================================================
2590
2591 #if wxUSE_FONTMAP
2592
2593 class wxMBConv_wxwin : public wxMBConv
2594 {
2595 private:
2596     void Init()
2597     {
2598         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2599                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2600     }
2601
2602 public:
2603     // temporarily just use wxEncodingConverter stuff,
2604     // so that it works while a better implementation is built
2605     wxMBConv_wxwin(const wxChar* name)
2606     {
2607         if (name)
2608             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2609         else
2610             m_enc = wxFONTENCODING_SYSTEM;
2611
2612         Init();
2613     }
2614
2615     wxMBConv_wxwin(wxFontEncoding enc)
2616     {
2617         m_enc = enc;
2618
2619         Init();
2620     }
2621
2622     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2623     {
2624         size_t inbuf = strlen(psz);
2625         if (buf)
2626         {
2627             if (!m2w.Convert(psz,buf))
2628                 return (size_t)-1;
2629         }
2630         return inbuf;
2631     }
2632
2633     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2634     {
2635         const size_t inbuf = wxWcslen(psz);
2636         if (buf)
2637         {
2638             if (!w2m.Convert(psz,buf))
2639                 return (size_t)-1;
2640         }
2641
2642         return inbuf;
2643     }
2644
2645     bool IsOk() const { return m_ok; }
2646
2647 public:
2648     wxFontEncoding m_enc;
2649     wxEncodingConverter m2w, w2m;
2650
2651 private:
2652     virtual size_t GetMinMBCharWidth() const
2653     {
2654         switch ( m_enc )
2655         {
2656             case wxFONTENCODING_UTF16BE:
2657             case wxFONTENCODING_UTF16LE:
2658                 return 2;
2659
2660             case wxFONTENCODING_UTF32BE:
2661             case wxFONTENCODING_UTF32LE:
2662                 return 4;
2663
2664             default:
2665                 return 1;
2666         }
2667     }
2668
2669     // were we initialized successfully?
2670     bool m_ok;
2671
2672     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2673 };
2674
2675 // make the constructors available for unit testing
2676 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2677 {
2678     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2679     if ( !result->IsOk() )
2680     {
2681         delete result;
2682         return 0;
2683     }
2684     return result;
2685 }
2686
2687 #endif // wxUSE_FONTMAP
2688
2689 // ============================================================================
2690 // wxCSConv implementation
2691 // ============================================================================
2692
2693 void wxCSConv::Init()
2694 {
2695     m_name = NULL;
2696     m_convReal =  NULL;
2697     m_deferred = true;
2698 }
2699
2700 wxCSConv::wxCSConv(const wxChar *charset)
2701 {
2702     Init();
2703
2704     if ( charset )
2705     {
2706         SetName(charset);
2707     }
2708
2709 #if wxUSE_FONTMAP
2710     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2711 #else
2712     m_encoding = wxFONTENCODING_SYSTEM;
2713 #endif
2714 }
2715
2716 wxCSConv::wxCSConv(wxFontEncoding encoding)
2717 {
2718     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2719     {
2720         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2721
2722         encoding = wxFONTENCODING_SYSTEM;
2723     }
2724
2725     Init();
2726
2727     m_encoding = encoding;
2728 }
2729
2730 wxCSConv::~wxCSConv()
2731 {
2732     Clear();
2733 }
2734
2735 wxCSConv::wxCSConv(const wxCSConv& conv)
2736         : wxMBConv()
2737 {
2738     Init();
2739
2740     SetName(conv.m_name);
2741     m_encoding = conv.m_encoding;
2742 }
2743
2744 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2745 {
2746     Clear();
2747
2748     SetName(conv.m_name);
2749     m_encoding = conv.m_encoding;
2750
2751     return *this;
2752 }
2753
2754 void wxCSConv::Clear()
2755 {
2756     free(m_name);
2757     delete m_convReal;
2758
2759     m_name = NULL;
2760     m_convReal = NULL;
2761 }
2762
2763 void wxCSConv::SetName(const wxChar *charset)
2764 {
2765     if (charset)
2766     {
2767         m_name = wxStrdup(charset);
2768         m_deferred = true;
2769     }
2770 }
2771
2772 #if wxUSE_FONTMAP
2773 #include "wx/hashmap.h"
2774
2775 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2776                      wxEncodingNameCache );
2777
2778 static wxEncodingNameCache gs_nameCache;
2779 #endif
2780
2781 wxMBConv *wxCSConv::DoCreate() const
2782 {
2783 #if wxUSE_FONTMAP
2784     wxLogTrace(TRACE_STRCONV,
2785                wxT("creating conversion for %s"),
2786                (m_name ? m_name
2787                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2788 #endif // wxUSE_FONTMAP
2789
2790     // check for the special case of ASCII or ISO8859-1 charset: as we have
2791     // special knowledge of it anyhow, we don't need to create a special
2792     // conversion object
2793     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2794             m_encoding == wxFONTENCODING_DEFAULT )
2795     {
2796         // don't convert at all
2797         return NULL;
2798     }
2799
2800     // we trust OS to do conversion better than we can so try external
2801     // conversion methods first
2802     //
2803     // the full order is:
2804     //      1. OS conversion (iconv() under Unix or Win32 API)
2805     //      2. hard coded conversions for UTF
2806     //      3. wxEncodingConverter as fall back
2807
2808     // step (1)
2809 #ifdef HAVE_ICONV
2810 #if !wxUSE_FONTMAP
2811     if ( m_name )
2812 #endif // !wxUSE_FONTMAP
2813     {
2814         wxString name(m_name);
2815         wxFontEncoding encoding(m_encoding);
2816
2817         if ( !name.empty() )
2818         {
2819             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2820             if ( conv->IsOk() )
2821                 return conv;
2822
2823             delete conv;
2824
2825 #if wxUSE_FONTMAP
2826             encoding =
2827                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2828 #endif // wxUSE_FONTMAP
2829         }
2830 #if wxUSE_FONTMAP
2831         {
2832             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2833             if ( it != gs_nameCache.end() )
2834             {
2835                 if ( it->second.empty() )
2836                     return NULL;
2837
2838                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2839                 if ( conv->IsOk() )
2840                     return conv;
2841
2842                 delete conv;
2843             }
2844
2845             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2846
2847             for ( ; *names; ++names )
2848             {
2849                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2850                 if ( conv->IsOk() )
2851                 {
2852                     gs_nameCache[encoding] = *names;
2853                     return conv;
2854                 }
2855
2856                 delete conv;
2857             }
2858
2859             gs_nameCache[encoding] = _T(""); // cache the failure
2860         }
2861 #endif // wxUSE_FONTMAP
2862     }
2863 #endif // HAVE_ICONV
2864
2865 #ifdef wxHAVE_WIN32_MB2WC
2866     {
2867 #if wxUSE_FONTMAP
2868         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2869                                       : new wxMBConv_win32(m_encoding);
2870         if ( conv->IsOk() )
2871             return conv;
2872
2873         delete conv;
2874 #else
2875         return NULL;
2876 #endif
2877     }
2878 #endif // wxHAVE_WIN32_MB2WC
2879 #if defined(__WXMAC__)
2880     {
2881         // leave UTF16 and UTF32 to the built-ins of wx
2882         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2883             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2884         {
2885
2886 #if wxUSE_FONTMAP
2887             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2888                                         : new wxMBConv_mac(m_encoding);
2889 #else
2890             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2891 #endif
2892             if ( conv->IsOk() )
2893                  return conv;
2894
2895             delete conv;
2896         }
2897     }
2898 #endif
2899 #if defined(__WXCOCOA__)
2900     {
2901         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2902         {
2903
2904 #if wxUSE_FONTMAP
2905             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2906                                           : new wxMBConv_cocoa(m_encoding);
2907 #else
2908             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2909 #endif
2910             if ( conv->IsOk() )
2911                  return conv;
2912
2913             delete conv;
2914         }
2915     }
2916 #endif
2917     // step (2)
2918     wxFontEncoding enc = m_encoding;
2919 #if wxUSE_FONTMAP
2920     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2921     {
2922         // use "false" to suppress interactive dialogs -- we can be called from
2923         // anywhere and popping up a dialog from here is the last thing we want to
2924         // do
2925         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2926     }
2927 #endif // wxUSE_FONTMAP
2928
2929     switch ( enc )
2930     {
2931         case wxFONTENCODING_UTF7:
2932              return new wxMBConvUTF7;
2933
2934         case wxFONTENCODING_UTF8:
2935              return new wxMBConvUTF8;
2936
2937         case wxFONTENCODING_UTF16BE:
2938              return new wxMBConvUTF16BE;
2939
2940         case wxFONTENCODING_UTF16LE:
2941              return new wxMBConvUTF16LE;
2942
2943         case wxFONTENCODING_UTF32BE:
2944              return new wxMBConvUTF32BE;
2945
2946         case wxFONTENCODING_UTF32LE:
2947              return new wxMBConvUTF32LE;
2948
2949         default:
2950              // nothing to do but put here to suppress gcc warnings
2951              ;
2952     }
2953
2954     // step (3)
2955 #if wxUSE_FONTMAP
2956     {
2957         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2958                                       : new wxMBConv_wxwin(m_encoding);
2959         if ( conv->IsOk() )
2960             return conv;
2961
2962         delete conv;
2963     }
2964 #endif // wxUSE_FONTMAP
2965
2966     // NB: This is a hack to prevent deadlock. What could otherwise happen
2967     //     in Unicode build: wxConvLocal creation ends up being here
2968     //     because of some failure and logs the error. But wxLog will try to
2969     //     attach timestamp, for which it will need wxConvLocal (to convert
2970     //     time to char* and then wchar_t*), but that fails, tries to log
2971     //     error, but wxLog has a (already locked) critical section that
2972     //     guards static buffer.
2973     static bool alreadyLoggingError = false;
2974     if (!alreadyLoggingError)
2975     {
2976         alreadyLoggingError = true;
2977         wxLogError(_("Cannot convert from the charset '%s'!"),
2978                    m_name ? m_name
2979                       :
2980 #if wxUSE_FONTMAP
2981                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2982 #else // !wxUSE_FONTMAP
2983                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2984 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2985               );
2986         alreadyLoggingError = false;
2987     }
2988
2989     return NULL;
2990 }
2991
2992 void wxCSConv::CreateConvIfNeeded() const
2993 {
2994     if ( m_deferred )
2995     {
2996         wxCSConv *self = (wxCSConv *)this; // const_cast
2997
2998 #if wxUSE_INTL
2999         // if we don't have neither the name nor the encoding, use the default
3000         // encoding for this system
3001         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3002         {
3003             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3004         }
3005 #endif // wxUSE_INTL
3006
3007         self->m_convReal = DoCreate();
3008         self->m_deferred = false;
3009     }
3010 }
3011
3012 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3013 {
3014     CreateConvIfNeeded();
3015
3016     if (m_convReal)
3017         return m_convReal->MB2WC(buf, psz, n);
3018
3019     // latin-1 (direct)
3020     size_t len = strlen(psz);
3021
3022     if (buf)
3023     {
3024         for (size_t c = 0; c <= len; c++)
3025             buf[c] = (unsigned char)(psz[c]);
3026     }
3027
3028     return len;
3029 }
3030
3031 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3032 {
3033     CreateConvIfNeeded();
3034
3035     if (m_convReal)
3036         return m_convReal->WC2MB(buf, psz, n);
3037
3038     // latin-1 (direct)
3039     const size_t len = wxWcslen(psz);
3040     if (buf)
3041     {
3042         for (size_t c = 0; c <= len; c++)
3043         {
3044             if (psz[c] > 0xFF)
3045                 return (size_t)-1;
3046             buf[c] = (char)psz[c];
3047         }
3048     }
3049     else
3050     {
3051         for (size_t c = 0; c <= len; c++)
3052         {
3053             if (psz[c] > 0xFF)
3054                 return (size_t)-1;
3055         }
3056     }
3057
3058     return len;
3059 }
3060
3061 size_t wxCSConv::GetMinMBCharWidth() const
3062 {
3063     CreateConvIfNeeded();
3064
3065     if ( m_convReal )
3066     {
3067         // cast needed just to call private function of m_convReal
3068         return ((wxCSConv *)m_convReal)->GetMinMBCharWidth();
3069     }
3070
3071     return 1;
3072 }
3073
3074 // ----------------------------------------------------------------------------
3075 // globals
3076 // ----------------------------------------------------------------------------
3077
3078 #ifdef __WINDOWS__
3079     static wxMBConv_win32 wxConvLibcObj;
3080 #elif defined(__WXMAC__) && !defined(__MACH__)
3081     static wxMBConv_mac wxConvLibcObj ;
3082 #else
3083     static wxMBConvLibc wxConvLibcObj;
3084 #endif
3085
3086 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3087 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3088 static wxMBConvUTF7 wxConvUTF7Obj;
3089 static wxMBConvUTF8 wxConvUTF8Obj;
3090
3091 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3092 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3093 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3094 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3095 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3096 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3097 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3098 #ifdef __WXOSX__
3099                                     wxConvUTF8Obj;
3100 #else
3101                                     wxConvLibcObj;
3102 #endif
3103
3104
3105 #else // !wxUSE_WCHAR_T
3106
3107 // stand-ins in absence of wchar_t
3108 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3109                                 wxConvISO8859_1,
3110                                 wxConvLocal,
3111                                 wxConvUTF8;
3112
3113 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T