src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded; but
 207             // still account for the trailing NULL
 208             dstWritten++;
 209             break;
 210         }
 211
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for trailing NUL
 216
 217         dstWritten += lenChunk;
 218
 219         if ( dst )
 220         {
 221             if ( dstWritten > dstLen )
 222                 return wxCONV_FAILED;
 223
 224             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 225                 return wxCONV_FAILED;
 226
 227             dst += lenChunk;
 228         }
 229
 230         if ( !srcEnd )
 231         {
 232             // we convert the entire string in this cas, as we suppose that the
 233             // string is NUL-terminated and so srcEnd is not used at all
 234             break;
 235         }
 236
 237         // advance the input pointer past the end of this chunk
 238         while ( NotAllNULs(src, nulLen) )
 239         {
 240             // notice that we must skip over multiple bytes here as we suppose
 241             // that if NUL takes 2 or 4 bytes, then all the other characters do
 242             // too and so if advanced by a single byte we might erroneously
 243             // detect sequences of NUL bytes in the middle of the input
 244             src += nulLen;
 245         }
 246
 247         src += nulLen; // skipping over its terminator as well
 248
 249         // note that ">=" (and not just "==") is needed here as the terminator
 250         // we skipped just above could be inside or just after the buffer
 251         // delimited by inEnd
 252         if ( src >= srcEnd )
 253             break;
 254     }
 255
 256     return dstWritten;
 257 }
 258
 259 size_t
 260 wxMBConv::FromWChar(char *dst, size_t dstLen,
 261                     const wchar_t *src, size_t srcLen) const
 262 {
 263     // the number of chars [which would be] written to dst [if it were not NULL]
 264     size_t dstWritten = 0;
 265
 266     // make a copy of the input string unless it is already properly
 267     // NUL-terminated
 268     //
 269     // if we don't know its length we have no choice but to assume that it is,
 270     // indeed, properly terminated
 271     wxWCharBuffer bufTmp;
 272     if ( srcLen == (size_t)-1 )
 273     {
 274         srcLen = wxWcslen(src) + 1;
 275     }
 276     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 277     {
 278         // make a copy in order to properly NUL-terminate the string
 279         bufTmp = wxWCharBuffer(srcLen);
 280         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 281         src = bufTmp;
 282     }
 283
 284     const size_t lenNul = GetMBNulLen();
 285     for ( const wchar_t * const srcEnd = src + srcLen;
 286           src < srcEnd;
 287           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 288     {
 289         // try to convert the current chunk
 290         size_t lenChunk = WC2MB(NULL, src, 0);
 291
 292         if ( lenChunk == wxCONV_FAILED )
 293             return wxCONV_FAILED;
 294
 295         lenChunk += lenNul;
 296         dstWritten += lenChunk;
 297
 298         if ( dst )
 299         {
 300             if ( dstWritten > dstLen )
 301                 return wxCONV_FAILED;
 302
 303             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 304                 return wxCONV_FAILED;
 305
 306             dst += lenChunk;
 307         }
 308     }
 309
 310     return dstWritten;
 311 }
 312
 313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 314 {
 315     size_t rc = ToWChar(out, outLen, in);
 316     if ( rc != wxCONV_FAILED )
 317     {
 318         // ToWChar() returns the buffer length, i.e. including the trailing
 319         // NUL, while this method doesn't take it into account
 320         rc--;
 321     }
 322
 323     return rc;
 324 }
 325
 326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 327 {
 328     size_t rc = FromWChar(out, outLen, in);
 329     if ( rc != wxCONV_FAILED )
 330     {
 331         rc -= GetMBNulLen();
 332     }
 333
 334     return rc;
 335 }
 336
// Out-of-line definition of the destructor.
wxMBConv::~wxMBConv()
{
    // nothing to do here (the definition itself is probably necessary for
    // linking under Darwin)
}
 341
 342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 343 {
 344     if ( psz )
 345     {
 346         // calculate the length of the buffer needed first
 347         const size_t nLen = MB2WC(NULL, psz, 0);
 348         if ( nLen != wxCONV_FAILED )
 349         {
 350             // now do the actual conversion
 351             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 352
 353             // +1 for the trailing NULL
 354             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 355                 return buf;
 356         }
 357     }
 358
 359     return wxWCharBuffer();
 360 }
 361
 362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 363 {
 364     if ( pwz )
 365     {
 366         const size_t nLen = WC2MB(NULL, pwz, 0);
 367         if ( nLen != wxCONV_FAILED )
 368         {
 369             // extra space for trailing NUL(s)
 370             static const size_t extraLen = GetMaxMBNulLen();
 371
 372             wxCharBuffer buf(nLen + extraLen - 1);
 373             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxCharBuffer();
 379 }
 380
 381 const wxWCharBuffer
 382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 383 {
 384     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 385     if ( dstLen != wxCONV_FAILED )
 386     {
 387         wxWCharBuffer wbuf(dstLen - 1);
 388         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 389         {
 390             if ( outLen )
 391                 *outLen = dstLen - 1;
 392             return wbuf;
 393         }
 394     }
 395
 396     if ( outLen )
 397         *outLen = 0;
 398
 399     return wxWCharBuffer();
 400 }
 401
 402 const wxCharBuffer
 403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxCharBuffer buf(dstLen - 1);
 409         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 410         {
 411             if ( outLen )
 412                 *outLen = dstLen - 1;
 413             return buf;
 414         }
 415     }
 416
 417     if ( outLen )
 418         *outLen = 0;
 419
 420     return wxCharBuffer();
 421 }
 422
 423 // ----------------------------------------------------------------------------
 424 // wxMBConvLibc
 425 // ----------------------------------------------------------------------------
 426
// Multibyte to wide character conversion: simply forwards all of its
// arguments to wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 431
// Wide character to multibyte conversion: simply forwards all of its
// arguments to wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 436
 437 // ----------------------------------------------------------------------------
 438 // wxConvBrokenFileNames
 439 // ----------------------------------------------------------------------------
 440
 441 #ifdef __UNIX__
 442
 443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 444 {
 445     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 446                   || wxStricmp(charset, _T("UTF8")) == 0  )
 447         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 448     else
 449         m_conv = new wxCSConv(charset);
 450 }
 451
 452 #endif // __UNIX__
 453
 454 // ----------------------------------------------------------------------------
 455 // UTF-7
 456 // ----------------------------------------------------------------------------
 457
 458 // Implementation (C) 2004 Fredrik Roubert
 459
 460 //
 461 // BASE64 decoding table
 462 //
 463 static const unsigned char utf7unb64[] =
 464 {
 465     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 471     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 472     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 473     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 474     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 475     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 476     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 477     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 478     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 479     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 480     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 481     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 482     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 483     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 485     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 486     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 488     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 489     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 497 };
 498
 499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 500 {
 501     size_t len = 0;
 502
 503     while ( *psz && (!buf || (len < n)) )
 504     {
 505         unsigned char cc = *psz++;
 506         if (cc != '+')
 507         {
 508             // plain ASCII char
 509             if (buf)
 510                 *buf++ = cc;
 511             len++;
 512         }
 513         else if (*psz == '-')
 514         {
 515             // encoded plus sign
 516             if (buf)
 517                 *buf++ = cc;
 518             len++;
 519             psz++;
 520         }
 521         else // start of BASE64 encoded string
 522         {
 523             bool lsb, ok;
 524             unsigned int d, l;
 525             for ( ok = lsb = false, d = 0, l = 0;
 526                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 527                   psz++ )
 528             {
 529                 d <<= 6;
 530                 d += cc;
 531                 for (l += 6; l >= 8; lsb = !lsb)
 532                 {
 533                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 534                     if (lsb)
 535                     {
 536                         if (buf)
 537                             *buf++ |= c;
 538                         len ++;
 539                     }
 540                     else
 541                     {
 542                         if (buf)
 543                             *buf = (wchar_t)(c << 8);
 544                     }
 545
 546                     ok = true;
 547                 }
 548             }
 549
 550             if ( !ok )
 551             {
 552                 // in valid UTF7 we should have valid characters after '+'
 553                 return (size_t)-1;
 554             }
 555
 556             if (*psz == '-')
 557                 psz++;
 558         }
 559     }
 560
 561     if ( buf && (len < n) )
 562         *buf = '\0';
 563
 564     return len;
 565 }
 566
 567 //
 568 // BASE64 encoding table
 569 //
 570 static const unsigned char utf7enb64[] =
 571 {
 572     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 573     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 574     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 575     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 576     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 577     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 578     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 579     '4', '5', '6', '7', '8', '9', '+', '/'
 580 };
 581
 582 //
 583 // UTF-7 encoding table
 584 //
 585 // 0 - Set D (directly encoded characters)
 586 // 1 - Set O (optional direct characters)
 587 // 2 - whitespace characters (optional)
 588 // 3 - special characters
 589 //
 590 static const unsigned char utf7encode[128] =
 591 {
 592     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 593     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 594     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 595     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 596     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 597     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 598     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 599     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 600 };
 601
 602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 603 {
 604     size_t len = 0;
 605
 606     while (*psz && ((!buf) || (len < n)))
 607     {
 608         wchar_t cc = *psz++;
 609         if (cc < 0x80 && utf7encode[cc] < 1)
 610         {
 611             // plain ASCII char
 612             if (buf)
 613                 *buf++ = (char)cc;
 614             len++;
 615         }
 616 #ifndef WC_UTF16
 617         else if (((wxUint32)cc) > 0xffff)
 618         {
 619             // no surrogate pair generation (yet?)
 620             return (size_t)-1;
 621         }
 622 #endif
 623         else
 624         {
 625             if (buf)
 626                 *buf++ = '+';
 627             len++;
 628             if (cc != '+')
 629             {
 630                 // BASE64 encode string
 631                 unsigned int lsb, d, l;
 632                 for (d = 0, l = 0; /*nothing*/; psz++)
 633                 {
 634                     for (lsb = 0; lsb < 2; lsb ++)
 635                     {
 636                         d <<= 8;
 637                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 638
 639                         for (l += 8; l >= 6; )
 640                         {
 641                             l -= 6;
 642                             if (buf)
 643                                 *buf++ = utf7enb64[(d >> l) % 64];
 644                             len++;
 645                         }
 646                     }
 647                     cc = *psz;
 648                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 649                         break;
 650                 }
 651                 if (l != 0)
 652                 {
 653                     if (buf)
 654                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 655                     len++;
 656                 }
 657             }
 658             if (buf)
 659                 *buf++ = '-';
 660             len++;
 661         }
 662     }
 663     if (buf && (len < n))
 664         *buf = 0;
 665     return len;
 666 }
 667
 668 // ----------------------------------------------------------------------------
 669 // UTF-8
 670 // ----------------------------------------------------------------------------
 671
 672 static wxUint32 utf8_max[]=
 673     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 674
// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the range is half open, i.e.
// wxUnicodePUAEnd itself is not part of it, and has one value for each of
// the 256 possible byte values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 679
 680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 681 {
 682     size_t len = 0;
 683
 684     while (*psz && ((!buf) || (len < n)))
 685     {
 686         const char *opsz = psz;
 687         bool invalid = false;
 688         unsigned char cc = *psz++, fc = cc;
 689         unsigned cnt;
 690         for (cnt = 0; fc & 0x80; cnt++)
 691             fc <<= 1;
 692         if (!cnt)
 693         {
 694             // plain ASCII char
 695             if (buf)
 696                 *buf++ = cc;
 697             len++;
 698
 699             // escape the escape character for octal escapes
 700             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 701                     && cc == '\\' && (!buf || len < n))
 702             {
 703                 if (buf)
 704                     *buf++ = cc;
 705                 len++;
 706             }
 707         }
 708         else
 709         {
 710             cnt--;
 711             if (!cnt)
 712             {
 713                 // invalid UTF-8 sequence
 714                 invalid = true;
 715             }
 716             else
 717             {
 718                 unsigned ocnt = cnt - 1;
 719                 wxUint32 res = cc & (0x3f >> cnt);
 720                 while (cnt--)
 721                 {
 722                     cc = *psz;
 723                     if ((cc & 0xC0) != 0x80)
 724                     {
 725                         // invalid UTF-8 sequence
 726                         invalid = true;
 727                         break;
 728                     }
 729                     psz++;
 730                     res = (res << 6) | (cc & 0x3f);
 731                 }
 732                 if (invalid || res <= utf8_max[ocnt])
 733                 {
 734                     // illegal UTF-8 encoding
 735                     invalid = true;
 736                 }
 737                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 738                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 739                 {
 740                     // if one of our PUA characters turns up externally
 741                     // it must also be treated as an illegal sequence
 742                     // (a bit like you have to escape an escape character)
 743                     invalid = true;
 744                 }
 745                 else
 746                 {
 747 #ifdef WC_UTF16
 748                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 749                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 750                     if (pa == (size_t)-1)
 751                     {
 752                         invalid = true;
 753                     }
 754                     else
 755                     {
 756                         if (buf)
 757                             buf += pa;
 758                         len += pa;
 759                     }
 760 #else // !WC_UTF16
 761                     if (buf)
 762                         *buf++ = (wchar_t)res;
 763                     len++;
 764 #endif // WC_UTF16/!WC_UTF16
 765                 }
 766             }
 767             if (invalid)
 768             {
 769                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 770                 {
 771                     while (opsz < psz && (!buf || len < n))
 772                     {
 773 #ifdef WC_UTF16
 774                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 775                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 776                         wxASSERT(pa != (size_t)-1);
 777                         if (buf)
 778                             buf += pa;
 779                         opsz++;
 780                         len += pa;
 781 #else
 782                         if (buf)
 783                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 784                         opsz++;
 785                         len++;
 786 #endif
 787                     }
 788                 }
 789                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 790                 {
 791                     while (opsz < psz && (!buf || len < n))
 792                     {
 793                         if ( buf && len + 3 < n )
 794                         {
 795                             unsigned char on = *opsz;
 796                             *buf++ = L'\\';
 797                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 798                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 799                             *buf++ = (wchar_t)( L'0' + on % 010 );
 800                         }
 801                         opsz++;
 802                         len += 4;
 803                     }
 804                 }
 805                 else // MAP_INVALID_UTF8_NOT
 806                 {
 807                     return (size_t)-1;
 808                 }
 809             }
 810         }
 811     }
 812     if (buf && (len < n))
 813         *buf = 0;
 814     return len;
 815 }
 816
 817 static inline bool isoctal(wchar_t wch)
 818 {
 819     return L'0' <= wch && wch <= L'7';
 820 }
 821
 822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 823 {
 824     size_t len = 0;
 825
 826     while (*psz && ((!buf) || (len < n)))
 827     {
 828         wxUint32 cc;
 829 #ifdef WC_UTF16
 830         // cast is ok for WC_UTF16
 831         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 832         psz += (pa == (size_t)-1) ? 1 : pa;
 833 #else
 834         cc=(*psz++) & 0x7fffffff;
 835 #endif
 836
 837         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 838                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 839         {
 840             if (buf)
 841                 *buf++ = (char)(cc - wxUnicodePUA);
 842             len++;
 843         }
 844         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 845                     && cc == L'\\' && psz[0] == L'\\' )
 846         {
 847             if (buf)
 848                 *buf++ = (char)cc;
 849             psz++;
 850             len++;
 851         }
 852         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 853                     cc == L'\\' &&
 854                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 855         {
 856             if (buf)
 857             {
 858                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 859                                  (psz[1] - L'0')*010 +
 860                                  (psz[2] - L'0'));
 861             }
 862
 863             psz += 3;
 864             len++;
 865         }
 866         else
 867         {
 868             unsigned cnt;
 869             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 870             if (!cnt)
 871             {
 872                 // plain ASCII char
 873                 if (buf)
 874                     *buf++ = (char) cc;
 875                 len++;
 876             }
 877
 878             else
 879             {
 880                 len += cnt + 1;
 881                 if (buf)
 882                 {
 883                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 884                     while (cnt--)
 885                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 886                 }
 887             }
 888         }
 889     }
 890
 891     if (buf && (len<n))
 892         *buf = 0;
 893
 894     return len;
 895 }
 896
 897 // ----------------------------------------------------------------------------
 898 // UTF-16
 899 // ----------------------------------------------------------------------------
 900
// The conversions below are implemented twice: the "straight" version copies
// the 16 bit units unchanged and the "swap" one exchanges their bytes. Which
// of them implements wxMBConvUTF16BE and which wxMBConvUTF16LE depends on
// whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 908
 909
 910 #ifdef WC_UTF16
 911
 912 // copy 16bit MB to 16bit String
 913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 914 {
 915     size_t len=0;
 916
 917     while (*(wxUint16*)psz && (!buf || len < n))
 918     {
 919         if (buf)
 920             *buf++ = *(wxUint16*)psz;
 921         len++;
 922
 923         psz += sizeof(wxUint16);
 924     }
 925     if (buf && len<n)   *buf=0;
 926
 927     return len;
 928 }
 929
 930
 931 // copy 16bit String to 16bit MB
 932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 933 {
 934     size_t len=0;
 935
 936     while (*psz && (!buf || len < n))
 937     {
 938         if (buf)
 939         {
 940             *(wxUint16*)buf = *psz;
 941             buf += sizeof(wxUint16);
 942         }
 943         len += sizeof(wxUint16);
 944         psz++;
 945     }
 946     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 947
 948     return len;
 949 }
 950
 951
 952 // swap 16bit MB to 16bit String
 953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 954 {
 955     size_t len = 0;
 956
 957     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 958     // inside the string
 959     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 960     {
 961         if ( buf )
 962         {
 963             ((char *)buf)[0] = psz[1];
 964             ((char *)buf)[1] = psz[0];
 965             buf++;
 966         }
 967         len++;
 968         psz += 2;
 969     }
 970
 971     if ( buf && len < n )
 972         *buf = L'\0';
 973
 974     return len;
 975 }
 976
 977
 978 // swap 16bit MB to 16bit String
 979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 980 {
 981     size_t len = 0;
 982
 983     while ( *psz && (!buf || len < n) )
 984     {
 985         if ( buf )
 986         {
 987             *buf++ = ((char*)psz)[1];
 988             *buf++ = ((char*)psz)[0];
 989         }
 990         len += 2;
 991         psz++;
 992     }
 993
 994     if ( buf && len < n - 1 )
 995     {
 996         buf[0] =
 997         buf[1] = '\0';
 998     }
 999
1000     return len;
1001 }
1002
1003
1004 #else // WC_UTF16
1005
1006
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010     size_t len=0;
1011
1012     while (*(wxUint16*)psz && (!buf || len < n))
1013     {
1014         wxUint32 cc;
1015         size_t pa=decode_utf16((wxUint16*)psz, cc);
1016         if (pa == (size_t)-1)
1017             return pa;
1018
1019         if (buf)
1020             *buf++ = (wchar_t)cc;
1021         len++;
1022         psz += pa * sizeof(wxUint16);
1023     }
1024     if (buf && len<n)   *buf=0;
1025
1026     return len;
1027 }
1028
1029
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032 {
1033     size_t len=0;
1034
1035     while (*psz && (!buf || len < n))
1036     {
1037         wxUint16 cc[2];
1038         size_t pa=encode_utf16(*psz, cc);
1039
1040         if (pa == (size_t)-1)
1041             return pa;
1042
1043         if (buf)
1044         {
1045             *(wxUint16*)buf = cc[0];
1046             buf += sizeof(wxUint16);
1047             if (pa > 1)
1048             {
1049                 *(wxUint16*)buf = cc[1];
1050                 buf += sizeof(wxUint16);
1051             }
1052         }
1053
1054         len += pa*sizeof(wxUint16);
1055         psz++;
1056     }
1057     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1058
1059     return len;
1060 }
1061
1062
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1065 {
1066     size_t len=0;
1067
1068     while (*(wxUint16*)psz && (!buf || len < n))
1069     {
1070         wxUint32 cc;
1071         char tmp[4];
1072         tmp[0]=psz[1];  tmp[1]=psz[0];
1073         tmp[2]=psz[3];  tmp[3]=psz[2];
1074
1075         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1076         if (pa == (size_t)-1)
1077             return pa;
1078
1079         if (buf)
1080             *buf++ = (wchar_t)cc;
1081
1082         len++;
1083         psz += pa * sizeof(wxUint16);
1084     }
1085     if (buf && len<n)   *buf=0;
1086
1087     return len;
1088 }
1089
1090
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1093 {
1094     size_t len=0;
1095
1096     while (*psz && (!buf || len < n))
1097     {
1098         wxUint16 cc[2];
1099         size_t pa=encode_utf16(*psz, cc);
1100
1101         if (pa == (size_t)-1)
1102             return pa;
1103
1104         if (buf)
1105         {
1106             *buf++ = ((char*)cc)[1];
1107             *buf++ = ((char*)cc)[0];
1108             if (pa > 1)
1109             {
1110                 *buf++ = ((char*)cc)[3];
1111                 *buf++ = ((char*)cc)[2];
1112             }
1113         }
1114
1115         len += pa*sizeof(wxUint16);
1116         psz++;
1117     }
1118     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1119
1120     return len;
1121 }
1122
1123 #endif // WC_UTF16
1124
1125
1126 // ----------------------------------------------------------------------------
1127 // UTF-32
1128 // ----------------------------------------------------------------------------
1129
1130 #ifdef WORDS_BIGENDIAN
1131 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1132 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1133 #else
1134 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1135 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1136 #endif
1137
1138
1139 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1140 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1141
1142
1143 #ifdef WC_UTF16
1144
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1147 {
1148     size_t len=0;
1149
1150     while (*(wxUint32*)psz && (!buf || len < n))
1151     {
1152         wxUint16 cc[2];
1153
1154         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1155         if (pa == (size_t)-1)
1156             return pa;
1157
1158         if (buf)
1159         {
1160             *buf++ = cc[0];
1161             if (pa > 1)
1162                 *buf++ = cc[1];
1163         }
1164         len += pa;
1165         psz += sizeof(wxUint32);
1166     }
1167     if (buf && len<n)   *buf=0;
1168
1169     return len;
1170 }
1171
1172
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1175 {
1176     size_t len=0;
1177
1178     while (*psz && (!buf || len < n))
1179     {
1180         wxUint32 cc;
1181
1182         // cast is ok for WC_UTF16
1183         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1184         if (pa == (size_t)-1)
1185             return pa;
1186
1187         if (buf)
1188         {
1189             *(wxUint32*)buf = cc;
1190             buf += sizeof(wxUint32);
1191         }
1192         len += sizeof(wxUint32);
1193         psz += pa;
1194     }
1195
1196     if (buf && len<=n-sizeof(wxUint32))
1197         *(wxUint32*)buf=0;
1198
1199     return len;
1200 }
1201
1202
1203
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1206 {
1207     size_t len=0;
1208
1209     while (*(wxUint32*)psz && (!buf || len < n))
1210     {
1211         char tmp[4];
1212         tmp[0] = psz[3];   tmp[1] = psz[2];
1213         tmp[2] = psz[1];   tmp[3] = psz[0];
1214
1215
1216         wxUint16 cc[2];
1217
1218         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1219         if (pa == (size_t)-1)
1220             return pa;
1221
1222         if (buf)
1223         {
1224             *buf++ = cc[0];
1225             if (pa > 1)
1226                 *buf++ = cc[1];
1227         }
1228         len += pa;
1229         psz += sizeof(wxUint32);
1230     }
1231
1232     if (buf && len<n)
1233         *buf=0;
1234
1235     return len;
1236 }
1237
1238
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1241 {
1242     size_t len=0;
1243
1244     while (*psz && (!buf || len < n))
1245     {
1246         char cc[4];
1247
1248         // cast is ok for WC_UTF16
1249         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1250         if (pa == (size_t)-1)
1251             return pa;
1252
1253         if (buf)
1254         {
1255             *buf++ = cc[3];
1256             *buf++ = cc[2];
1257             *buf++ = cc[1];
1258             *buf++ = cc[0];
1259         }
1260         len += sizeof(wxUint32);
1261         psz += pa;
1262     }
1263
1264     if (buf && len<=n-sizeof(wxUint32))
1265         *(wxUint32*)buf=0;
1266
1267     return len;
1268 }
1269
1270 #else // WC_UTF16
1271
1272
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1275 {
1276     size_t len=0;
1277
1278     while (*(wxUint32*)psz && (!buf || len < n))
1279     {
1280         if (buf)
1281             *buf++ = (wchar_t)(*(wxUint32*)psz);
1282         len++;
1283         psz += sizeof(wxUint32);
1284     }
1285
1286     if (buf && len<n)
1287         *buf=0;
1288
1289     return len;
1290 }
1291
1292
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1295 {
1296     size_t len=0;
1297
1298     while (*psz && (!buf || len < n))
1299     {
1300         if (buf)
1301         {
1302             *(wxUint32*)buf = *psz;
1303             buf += sizeof(wxUint32);
1304         }
1305
1306         len += sizeof(wxUint32);
1307         psz++;
1308     }
1309
1310     if (buf && len<=n-sizeof(wxUint32))
1311         *(wxUint32*)buf=0;
1312
1313     return len;
1314 }
1315
1316
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1319 {
1320     size_t len=0;
1321
1322     while (*(wxUint32*)psz && (!buf || len < n))
1323     {
1324         if (buf)
1325         {
1326             ((char *)buf)[0] = psz[3];
1327             ((char *)buf)[1] = psz[2];
1328             ((char *)buf)[2] = psz[1];
1329             ((char *)buf)[3] = psz[0];
1330             buf++;
1331         }
1332         len++;
1333         psz += sizeof(wxUint32);
1334     }
1335
1336     if (buf && len<n)
1337         *buf=0;
1338
1339     return len;
1340 }
1341
1342
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1345 {
1346     size_t len=0;
1347
1348     while (*psz && (!buf || len < n))
1349     {
1350         if (buf)
1351         {
1352             *buf++ = ((char *)psz)[3];
1353             *buf++ = ((char *)psz)[2];
1354             *buf++ = ((char *)psz)[1];
1355             *buf++ = ((char *)psz)[0];
1356         }
1357         len += sizeof(wxUint32);
1358         psz++;
1359     }
1360
1361     if (buf && len<=n-sizeof(wxUint32))
1362         *(wxUint32*)buf=0;
1363
1364     return len;
1365 }
1366
1367
1368 #endif // WC_UTF16
1369
1370
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1374
1375 #ifdef HAVE_ICONV
1376
1377 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1378 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1379 //     (unless there's yet another bug in glibc) the only case when iconv()
1380 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1381 //     left in the input buffer -- when _real_ error occurs,
1382 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1383 //     iconv() failure.
1384 //     [This bug does not appear in glibc 2.2.]
1385 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1386 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1387                                      (errno != E2BIG || bufLeft != 0))
1388 #else
1389 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1390 #endif
1391
1392 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1393
1394 #define ICONV_T_INVALID ((iconv_t)-1)
1395
1396 #if SIZEOF_WCHAR_T == 4
1397     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1398     #define WC_ENC      wxFONTENCODING_UTF32
1399 #elif SIZEOF_WCHAR_T == 2
1400     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1401     #define WC_ENC      wxFONTENCODING_UTF16
1402 #else // sizeof(wchar_t) != 2 nor 4
1403     // does this ever happen?
1404     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1405 #endif
1406
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1410
1411 class wxMBConv_iconv : public wxMBConv
1412 {
1413 public:
1414     wxMBConv_iconv(const wxChar *name);
1415     virtual ~wxMBConv_iconv();
1416
1417     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1418     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1419
1420     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1421     virtual size_t GetMBNulLen() const;
1422
1423     virtual wxMBConv *Clone() const
1424     {
1425         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1426         p->m_minMBCharWidth = m_minMBCharWidth;
1427         return p;
1428     }
1429
1430     bool IsOk() const
1431         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1432
1433 protected:
1434     // the iconv handlers used to translate from multibyte to wide char and in
1435     // the other direction
1436     iconv_t m2w,
1437             w2m;
1438 #if wxUSE_THREADS
1439     // guards access to m2w and w2m objects
1440     wxMutex m_iconvMutex;
1441 #endif
1442
1443 private:
1444     // the name (for iconv_open()) of a wide char charset -- if none is
1445     // available on this machine, it will remain NULL
1446     static wxString ms_wcCharsetName;
1447
1448     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1449     // different endian-ness than the native one
1450     static bool ms_wcNeedsSwap;
1451
1452
1453     // name of the encoding handled by this conversion
1454     wxString m_name;
1455
1456     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1457     // initially
1458     size_t m_minMBCharWidth;
1459 };
1460
1461 // make the constructor available for unit testing
1462 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1463 {
1464     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1465     if ( !result->IsOk() )
1466     {
1467         delete result;
1468         return 0;
1469     }
1470     return result;
1471 }
1472
1473 wxString wxMBConv_iconv::ms_wcCharsetName;
1474 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1475
1476 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1477               : m_name(name)
1478 {
1479     m_minMBCharWidth = 0;
1480
1481     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1482     // names for the charsets
1483     const wxCharBuffer cname(wxString(name).ToAscii());
1484
1485     // check for charset that represents wchar_t:
1486     if ( ms_wcCharsetName.empty() )
1487     {
1488         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1489
1490 #if wxUSE_FONTMAP
1491         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1492 #else // !wxUSE_FONTMAP
1493         static const wxChar *names[] =
1494         {
1495 #if SIZEOF_WCHAR_T == 4
1496             _T("UCS-4"),
1497 #elif SIZEOF_WCHAR_T = 2
1498             _T("UCS-2"),
1499 #endif
1500             NULL
1501         };
1502 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1503
1504         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1505         {
1506             const wxString nameCS(*names);
1507
1508             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1509             wxString nameXE(nameCS);
1510             #ifdef WORDS_BIGENDIAN
1511                 nameXE += _T("BE");
1512             #else // little endian
1513                 nameXE += _T("LE");
1514             #endif
1515
1516             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1517                        nameXE.c_str());
1518
1519             m2w = iconv_open(nameXE.ToAscii(), cname);
1520             if ( m2w == ICONV_T_INVALID )
1521             {
1522                 // try charset w/o bytesex info (e.g. "UCS4")
1523                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1524                            nameCS.c_str());
1525                 m2w = iconv_open(nameCS.ToAscii(), cname);
1526
1527                 // and check for bytesex ourselves:
1528                 if ( m2w != ICONV_T_INVALID )
1529                 {
1530                     char    buf[2], *bufPtr;
1531                     wchar_t wbuf[2], *wbufPtr;
1532                     size_t  insz, outsz;
1533                     size_t  res;
1534
1535                     buf[0] = 'A';
1536                     buf[1] = 0;
1537                     wbuf[0] = 0;
1538                     insz = 2;
1539                     outsz = SIZEOF_WCHAR_T * 2;
1540                     wbufPtr = wbuf;
1541                     bufPtr = buf;
1542
1543                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1544                                 (char**)&wbufPtr, &outsz);
1545
1546                     if (ICONV_FAILED(res, insz))
1547                     {
1548                         wxLogLastError(wxT("iconv"));
1549                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1550                                    nameCS.c_str());
1551                     }
1552                     else // ok, can convert to this encoding, remember it
1553                     {
1554                         ms_wcCharsetName = nameCS;
1555                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1556                     }
1557                 }
1558             }
1559             else // use charset not requiring byte swapping
1560             {
1561                 ms_wcCharsetName = nameXE;
1562             }
1563         }
1564
1565         wxLogTrace(TRACE_STRCONV,
1566                    wxT("iconv wchar_t charset is \"%s\"%s"),
1567                    ms_wcCharsetName.empty() ? _T("<none>")
1568                                             : ms_wcCharsetName.c_str(),
1569                    ms_wcNeedsSwap ? _T(" (needs swap)")
1570                                   : _T(""));
1571     }
1572     else // we already have ms_wcCharsetName
1573     {
1574         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1575     }
1576
1577     if ( ms_wcCharsetName.empty() )
1578     {
1579         w2m = ICONV_T_INVALID;
1580     }
1581     else
1582     {
1583         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1584         if ( w2m == ICONV_T_INVALID )
1585         {
1586             wxLogTrace(TRACE_STRCONV,
1587                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1588                        ms_wcCharsetName.c_str(), cname.data());
1589         }
1590     }
1591 }
1592
1593 wxMBConv_iconv::~wxMBConv_iconv()
1594 {
1595     if ( m2w != ICONV_T_INVALID )
1596         iconv_close(m2w);
1597     if ( w2m != ICONV_T_INVALID )
1598         iconv_close(w2m);
1599 }
1600
1601 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1602 {
1603     // find the string length: notice that must be done differently for
1604     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1605     size_t inbuf;
1606     const size_t nulLen = GetMBNulLen();
1607     switch ( nulLen )
1608     {
1609         default:
1610             return (size_t)-1;
1611
1612         case 1:
1613             inbuf = strlen(psz); // arguably more optimized than our version
1614             break;
1615
1616         case 2:
1617         case 4:
1618             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1619             // they also have to start at character boundary and not span two
1620             // adjacent characters
1621             const char *p;
1622             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1623                 ;
1624             inbuf = p - psz;
1625             break;
1626     }
1627
1628 #if wxUSE_THREADS
1629     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1630     //     Unfortunately there is a couple of global wxCSConv objects such as
1631     //     wxConvLocal that are used all over wx code, so we have to make sure
1632     //     the handle is used by at most one thread at the time. Otherwise
1633     //     only a few wx classes would be safe to use from non-main threads
1634     //     as MB<->WC conversion would fail "randomly".
1635     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1636 #endif // wxUSE_THREADS
1637
1638
1639     size_t outbuf = n * SIZEOF_WCHAR_T;
1640     size_t res, cres;
1641     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1642     wchar_t *bufPtr = buf;
1643     const char *pszPtr = psz;
1644
1645     if (buf)
1646     {
1647         // have destination buffer, convert there
1648         cres = iconv(m2w,
1649                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1650                      (char**)&bufPtr, &outbuf);
1651         res = n - (outbuf / SIZEOF_WCHAR_T);
1652
1653         if (ms_wcNeedsSwap)
1654         {
1655             // convert to native endianness
1656             for ( unsigned i = 0; i < res; i++ )
1657                 buf[n] = WC_BSWAP(buf[i]);
1658         }
1659
1660         // NUL-terminate the string if there is any space left
1661         if (res < n)
1662             buf[res] = 0;
1663     }
1664     else
1665     {
1666         // no destination buffer... convert using temp buffer
1667         // to calculate destination buffer requirement
1668         wchar_t tbuf[8];
1669         res = 0;
1670         do {
1671             bufPtr = tbuf;
1672             outbuf = 8*SIZEOF_WCHAR_T;
1673
1674             cres = iconv(m2w,
1675                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1676                          (char**)&bufPtr, &outbuf );
1677
1678             res += 8-(outbuf/SIZEOF_WCHAR_T);
1679         } while ((cres==(size_t)-1) && (errno==E2BIG));
1680     }
1681
1682     if (ICONV_FAILED(cres, inbuf))
1683     {
1684         //VS: it is ok if iconv fails, hence trace only
1685         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1686         return (size_t)-1;
1687     }
1688
1689     return res;
1690 }
1691
1692 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1693 {
1694 #if wxUSE_THREADS
1695     // NB: explained in MB2WC
1696     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1697 #endif
1698
1699     size_t inlen = wxWcslen(psz);
1700     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1701     size_t outbuf = n;
1702     size_t res, cres;
1703
1704     wchar_t *tmpbuf = 0;
1705
1706     if (ms_wcNeedsSwap)
1707     {
1708         // need to copy to temp buffer to switch endianness
1709         // (doing WC_BSWAP twice on the original buffer won't help, as it
1710         //  could be in read-only memory, or be accessed in some other thread)
1711         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1712         for ( size_t i = 0; i < inlen; i++ )
1713             tmpbuf[n] = WC_BSWAP(psz[i]);
1714         tmpbuf[inlen] = L'\0';
1715         psz = tmpbuf;
1716     }
1717
1718     if (buf)
1719     {
1720         // have destination buffer, convert there
1721         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1722
1723         res = n-outbuf;
1724
1725         // NB: iconv was given only wcslen(psz) characters on input, and so
1726         //     it couldn't convert the trailing zero. Let's do it ourselves
1727         //     if there's some room left for it in the output buffer.
1728         if (res < n)
1729             buf[0] = 0;
1730     }
1731     else
1732     {
1733         // no destination buffer... convert using temp buffer
1734         // to calculate destination buffer requirement
1735         char tbuf[16];
1736         res = 0;
1737         do {
1738             buf = tbuf; outbuf = 16;
1739
1740             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1741
1742             res += 16 - outbuf;
1743         } while ((cres==(size_t)-1) && (errno==E2BIG));
1744     }
1745
1746     if (ms_wcNeedsSwap)
1747     {
1748         free(tmpbuf);
1749     }
1750
1751     if (ICONV_FAILED(cres, inbuf))
1752     {
1753         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1754         return (size_t)-1;
1755     }
1756
1757     return res;
1758 }
1759
1760 size_t wxMBConv_iconv::GetMBNulLen() const
1761 {
1762     if ( m_minMBCharWidth == 0 )
1763     {
1764         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1765
1766 #if wxUSE_THREADS
1767         // NB: explained in MB2WC
1768         wxMutexLocker lock(self->m_iconvMutex);
1769 #endif
1770
1771         wchar_t *wnul = L"";
1772         char buf[8]; // should be enough for NUL in any encoding
1773         size_t inLen = sizeof(wchar_t),
1774                outLen = WXSIZEOF(buf);
1775         char *in = (char *)wnul;
1776         char *out = buf;
1777         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1778         {
1779             self->m_minMBCharWidth = (size_t)-1;
1780         }
1781         else // ok
1782         {
1783             self->m_minMBCharWidth = out - buf;
1784         }
1785     }
1786
1787     return m_minMBCharWidth;
1788 }
1789
1790 #endif // HAVE_ICONV
1791
1792
1793 // ============================================================================
1794 // Win32 conversion classes
1795 // ============================================================================
1796
1797 #ifdef wxHAVE_WIN32_MB2WC
1798
1799 // from utils.cpp
1800 #if wxUSE_FONTMAP
1801 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1802 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1803 #endif
1804
1805 class wxMBConv_win32 : public wxMBConv
1806 {
1807 public:
1808     wxMBConv_win32()
1809     {
1810         m_CodePage = CP_ACP;
1811         m_minMBCharWidth = 0;
1812     }
1813
1814     wxMBConv_win32(const wxMBConv_win32& conv)
1815     {
1816         m_CodePage = conv.m_CodePage;
1817         m_minMBCharWidth = conv.m_minMBCharWidth;
1818     }
1819
1820 #if wxUSE_FONTMAP
1821     wxMBConv_win32(const wxChar* name)
1822     {
1823         m_CodePage = wxCharsetToCodepage(name);
1824         m_minMBCharWidth = 0;
1825     }
1826
1827     wxMBConv_win32(wxFontEncoding encoding)
1828     {
1829         m_CodePage = wxEncodingToCodepage(encoding);
1830         m_minMBCharWidth = 0;
1831     }
1832 #endif // wxUSE_FONTMAP
1833
1834     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1835     {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
1841         //
1842         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1843         // Win XP or newer and it is not supported for UTF-[78] so we always
1844         // use our own conversions in this case. See
1845         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1846         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1847         if ( m_CodePage == CP_UTF8 )
1848         {
1849             return wxConvUTF8.MB2WC(buf, psz, n);
1850         }
1851
1852         if ( m_CodePage == CP_UTF7 )
1853         {
1854             return wxConvUTF7.MB2WC(buf, psz, n);
1855         }
1856
1857         int flags = 0;
1858         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1859                 IsAtLeastWin2kSP4() )
1860         {
1861             flags = MB_ERR_INVALID_CHARS;
1862         }
1863
1864         const size_t len = ::MultiByteToWideChar
1865                              (
1866                                 m_CodePage,     // code page
1867                                 flags,          // flags: fall on error
1868                                 psz,            // input string
1869                                 -1,             // its length (NUL-terminated)
1870                                 buf,            // output string
1871                                 buf ? n : 0     // size of output buffer
1872                              );
1873         if ( !len )
1874         {
1875             // function totally failed
1876             return (size_t)-1;
1877         }
1878
1879         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1880         // check if we succeeded, by doing a double trip:
1881         if ( !flags && buf )
1882         {
1883             const size_t mbLen = strlen(psz);
1884             wxCharBuffer mbBuf(mbLen);
1885             if ( ::WideCharToMultiByte
1886                    (
1887                       m_CodePage,
1888                       0,
1889                       buf,
1890                       -1,
1891                       mbBuf.data(),
1892                       mbLen + 1,        // size in bytes, not length
1893                       NULL,
1894                       NULL
1895                    ) == 0 ||
1896                   strcmp(mbBuf, psz) != 0 )
1897             {
1898                 // we didn't obtain the same thing we started from, hence
1899                 // the conversion was lossy and we consider that it failed
1900                 return (size_t)-1;
1901             }
1902         }
1903
1904         // note that it returns count of written chars for buf != NULL and size
1905         // of the needed buffer for buf == NULL so in either case the length of
1906         // the string (which never includes the terminating NUL) is one less
1907         return len - 1;
1908     }
1909
1910     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1911     {
1912         /*
1913             we have a problem here: by default, WideCharToMultiByte() may
1914             replace characters unrepresentable in the target code page with bad
1915             quality approximations such as turning "1/2" symbol (U+00BD) into
1916             "1" for the code pages which don't have it and we, obviously, want
1917             to avoid this at any price
1918
1919             the trouble is that this function does it _silently_, i.e. it won't
1920             even tell us whether it did or not... Win98/2000 and higher provide
1921             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1922             we have to resort to a round trip, i.e. check that converting back
1923             results in the same string -- this is, of course, expensive but
1924             otherwise we simply can't be sure to not garble the data.
1925          */
1926
1927         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1928         // it doesn't work with CJK encodings (which we test for rather roughly
1929         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1930         // supporting it
1931         BOOL usedDef wxDUMMY_INITIALIZE(false);
1932         BOOL *pUsedDef;
1933         int flags;
1934         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1935         {
1936             // it's our lucky day
1937             flags = WC_NO_BEST_FIT_CHARS;
1938             pUsedDef = &usedDef;
1939         }
1940         else // old system or unsupported encoding
1941         {
1942             flags = 0;
1943             pUsedDef = NULL;
1944         }
1945
1946         const size_t len = ::WideCharToMultiByte
1947                              (
1948                                 m_CodePage,     // code page
1949                                 flags,          // either none or no best fit
1950                                 pwz,            // input string
1951                                 -1,             // it is (wide) NUL-terminated
1952                                 buf,            // output buffer
1953                                 buf ? n : 0,    // and its size
1954                                 NULL,           // default "replacement" char
1955                                 pUsedDef        // [out] was it used?
1956                              );
1957
1958         if ( !len )
1959         {
1960             // function totally failed
1961             return (size_t)-1;
1962         }
1963
1964         // if we were really converting, check if we succeeded
1965         if ( buf )
1966         {
1967             if ( flags )
1968             {
1969                 // check if the conversion failed, i.e. if any replacements
1970                 // were done
1971                 if ( usedDef )
1972                     return (size_t)-1;
1973             }
1974             else // we must resort to double tripping...
1975             {
1976                 wxWCharBuffer wcBuf(n);
1977                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1978                         wcscmp(wcBuf, pwz) != 0 )
1979                 {
1980                     // we didn't obtain the same thing we started from, hence
1981                     // the conversion was lossy and we consider that it failed
1982                     return (size_t)-1;
1983                 }
1984             }
1985         }
1986
1987         // see the comment above for the reason of "len - 1"
1988         return len - 1;
1989     }
1990
1991     virtual size_t GetMBNulLen() const
1992     {
1993         if ( m_minMBCharWidth == 0 )
1994         {
1995             int len = ::WideCharToMultiByte
1996                         (
1997                             m_CodePage,     // code page
1998                             0,              // no flags
1999                             L"",            // input string
2000                             1,              // translate just the NUL
2001                             NULL,           // output buffer
2002                             0,              // and its size
2003                             NULL,           // no replacement char
2004                             NULL            // [out] don't care if it was used
2005                         );
2006
2007             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2008             switch ( len )
2009             {
2010                 default:
2011                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2012                     // fall through
2013
2014                 case 0:
2015                     self->m_minMBCharWidth = (size_t)-1;
2016                     break;
2017
2018                 case 1:
2019                 case 2:
2020                 case 4:
2021                     self->m_minMBCharWidth = len;
2022                     break;
2023             }
2024         }
2025
2026         return m_minMBCharWidth;
2027     }
2028
2029     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2030
2031     bool IsOk() const { return m_CodePage != -1; }
2032
2033 private:
2034     static bool CanUseNoBestFit()
2035     {
2036         static int s_isWin98Or2k = -1;
2037
2038         if ( s_isWin98Or2k == -1 )
2039         {
2040             int verMaj, verMin;
2041             switch ( wxGetOsVersion(&verMaj, &verMin) )
2042             {
2043                 case wxWIN95:
2044                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2045                     break;
2046
2047                 case wxWINDOWS_NT:
2048                     s_isWin98Or2k = verMaj >= 5;
2049                     break;
2050
2051                 default:
2052                     // unknown, be conseravtive by default
2053                     s_isWin98Or2k = 0;
2054             }
2055
2056             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2057         }
2058
2059         return s_isWin98Or2k == 1;
2060     }
2061
2062     static bool IsAtLeastWin2kSP4()
2063     {
2064 #ifdef __WXWINCE__
2065         return false;
2066 #else
2067         static int s_isAtLeastWin2kSP4 = -1;
2068
2069         if ( s_isAtLeastWin2kSP4 == -1 )
2070         {
2071             OSVERSIONINFOEX ver;
2072
2073             memset(&ver, 0, sizeof(ver));
2074             ver.dwOSVersionInfoSize = sizeof(ver);
2075             GetVersionEx((OSVERSIONINFO*)&ver);
2076
2077             s_isAtLeastWin2kSP4 =
2078               ((ver.dwMajorVersion > 5) || // Vista+
2079                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2080                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2081                ver.wServicePackMajor >= 4)) // 2000 SP4+
2082               ? 1 : 0;
2083         }
2084
2085         return s_isAtLeastWin2kSP4 == 1;
2086 #endif
2087     }
2088
2089
2090     // the code page we're working with
2091     long m_CodePage;
2092
2093     // cached result of GetMBNulLen(), set to 0 initially meaning
2094     // "unknown"
2095     size_t m_minMBCharWidth;
2096 };
2097
2098 #endif // wxHAVE_WIN32_MB2WC
2099
2100 // ============================================================================
2101 // Cocoa conversion classes
2102 // ============================================================================
2103
2104 #if defined(__WXCOCOA__)
2105
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2109
2110 #include <CoreFoundation/CFString.h>
2111 #include <CoreFoundation/CFStringEncodingExt.h>
2112
2113 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2114 {
2115     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2116     if ( encoding == wxFONTENCODING_DEFAULT )
2117     {
2118         enc = CFStringGetSystemEncoding();
2119     }
2120     else switch( encoding)
2121     {
2122         case wxFONTENCODING_ISO8859_1 :
2123             enc = kCFStringEncodingISOLatin1 ;
2124             break ;
2125         case wxFONTENCODING_ISO8859_2 :
2126             enc = kCFStringEncodingISOLatin2;
2127             break ;
2128         case wxFONTENCODING_ISO8859_3 :
2129             enc = kCFStringEncodingISOLatin3 ;
2130             break ;
2131         case wxFONTENCODING_ISO8859_4 :
2132             enc = kCFStringEncodingISOLatin4;
2133             break ;
2134         case wxFONTENCODING_ISO8859_5 :
2135             enc = kCFStringEncodingISOLatinCyrillic;
2136             break ;
2137         case wxFONTENCODING_ISO8859_6 :
2138             enc = kCFStringEncodingISOLatinArabic;
2139             break ;
2140         case wxFONTENCODING_ISO8859_7 :
2141             enc = kCFStringEncodingISOLatinGreek;
2142             break ;
2143         case wxFONTENCODING_ISO8859_8 :
2144             enc = kCFStringEncodingISOLatinHebrew;
2145             break ;
2146         case wxFONTENCODING_ISO8859_9 :
2147             enc = kCFStringEncodingISOLatin5;
2148             break ;
2149         case wxFONTENCODING_ISO8859_10 :
2150             enc = kCFStringEncodingISOLatin6;
2151             break ;
2152         case wxFONTENCODING_ISO8859_11 :
2153             enc = kCFStringEncodingISOLatinThai;
2154             break ;
2155         case wxFONTENCODING_ISO8859_13 :
2156             enc = kCFStringEncodingISOLatin7;
2157             break ;
2158         case wxFONTENCODING_ISO8859_14 :
2159             enc = kCFStringEncodingISOLatin8;
2160             break ;
2161         case wxFONTENCODING_ISO8859_15 :
2162             enc = kCFStringEncodingISOLatin9;
2163             break ;
2164
2165         case wxFONTENCODING_KOI8 :
2166             enc = kCFStringEncodingKOI8_R;
2167             break ;
2168         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2169             enc = kCFStringEncodingDOSRussian;
2170             break ;
2171
2172 //      case wxFONTENCODING_BULGARIAN :
2173 //          enc = ;
2174 //          break ;
2175
2176         case wxFONTENCODING_CP437 :
2177             enc =kCFStringEncodingDOSLatinUS ;
2178             break ;
2179         case wxFONTENCODING_CP850 :
2180             enc = kCFStringEncodingDOSLatin1;
2181             break ;
2182         case wxFONTENCODING_CP852 :
2183             enc = kCFStringEncodingDOSLatin2;
2184             break ;
2185         case wxFONTENCODING_CP855 :
2186             enc = kCFStringEncodingDOSCyrillic;
2187             break ;
2188         case wxFONTENCODING_CP866 :
2189             enc =kCFStringEncodingDOSRussian ;
2190             break ;
2191         case wxFONTENCODING_CP874 :
2192             enc = kCFStringEncodingDOSThai;
2193             break ;
2194         case wxFONTENCODING_CP932 :
2195             enc = kCFStringEncodingDOSJapanese;
2196             break ;
2197         case wxFONTENCODING_CP936 :
2198             enc =kCFStringEncodingDOSChineseSimplif ;
2199             break ;
2200         case wxFONTENCODING_CP949 :
2201             enc = kCFStringEncodingDOSKorean;
2202             break ;
2203         case wxFONTENCODING_CP950 :
2204             enc = kCFStringEncodingDOSChineseTrad;
2205             break ;
2206         case wxFONTENCODING_CP1250 :
2207             enc = kCFStringEncodingWindowsLatin2;
2208             break ;
2209         case wxFONTENCODING_CP1251 :
2210             enc =kCFStringEncodingWindowsCyrillic ;
2211             break ;
2212         case wxFONTENCODING_CP1252 :
2213             enc =kCFStringEncodingWindowsLatin1 ;
2214             break ;
2215         case wxFONTENCODING_CP1253 :
2216             enc = kCFStringEncodingWindowsGreek;
2217             break ;
2218         case wxFONTENCODING_CP1254 :
2219             enc = kCFStringEncodingWindowsLatin5;
2220             break ;
2221         case wxFONTENCODING_CP1255 :
2222             enc =kCFStringEncodingWindowsHebrew ;
2223             break ;
2224         case wxFONTENCODING_CP1256 :
2225             enc =kCFStringEncodingWindowsArabic ;
2226             break ;
2227         case wxFONTENCODING_CP1257 :
2228             enc = kCFStringEncodingWindowsBalticRim;
2229             break ;
2230 //   This only really encodes to UTF7 (if that) evidently
2231 //        case wxFONTENCODING_UTF7 :
2232 //            enc = kCFStringEncodingNonLossyASCII ;
2233 //            break ;
2234         case wxFONTENCODING_UTF8 :
2235             enc = kCFStringEncodingUTF8 ;
2236             break ;
2237         case wxFONTENCODING_EUC_JP :
2238             enc = kCFStringEncodingEUC_JP;
2239             break ;
2240         case wxFONTENCODING_UTF16 :
2241             enc = kCFStringEncodingUnicode ;
2242             break ;
2243         case wxFONTENCODING_MACROMAN :
2244             enc = kCFStringEncodingMacRoman ;
2245             break ;
2246         case wxFONTENCODING_MACJAPANESE :
2247             enc = kCFStringEncodingMacJapanese ;
2248             break ;
2249         case wxFONTENCODING_MACCHINESETRAD :
2250             enc = kCFStringEncodingMacChineseTrad ;
2251             break ;
2252         case wxFONTENCODING_MACKOREAN :
2253             enc = kCFStringEncodingMacKorean ;
2254             break ;
2255         case wxFONTENCODING_MACARABIC :
2256             enc = kCFStringEncodingMacArabic ;
2257             break ;
2258         case wxFONTENCODING_MACHEBREW :
2259             enc = kCFStringEncodingMacHebrew ;
2260             break ;
2261         case wxFONTENCODING_MACGREEK :
2262             enc = kCFStringEncodingMacGreek ;
2263             break ;
2264         case wxFONTENCODING_MACCYRILLIC :
2265             enc = kCFStringEncodingMacCyrillic ;
2266             break ;
2267         case wxFONTENCODING_MACDEVANAGARI :
2268             enc = kCFStringEncodingMacDevanagari ;
2269             break ;
2270         case wxFONTENCODING_MACGURMUKHI :
2271             enc = kCFStringEncodingMacGurmukhi ;
2272             break ;
2273         case wxFONTENCODING_MACGUJARATI :
2274             enc = kCFStringEncodingMacGujarati ;
2275             break ;
2276         case wxFONTENCODING_MACORIYA :
2277             enc = kCFStringEncodingMacOriya ;
2278             break ;
2279         case wxFONTENCODING_MACBENGALI :
2280             enc = kCFStringEncodingMacBengali ;
2281             break ;
2282         case wxFONTENCODING_MACTAMIL :
2283             enc = kCFStringEncodingMacTamil ;
2284             break ;
2285         case wxFONTENCODING_MACTELUGU :
2286             enc = kCFStringEncodingMacTelugu ;
2287             break ;
2288         case wxFONTENCODING_MACKANNADA :
2289             enc = kCFStringEncodingMacKannada ;
2290             break ;
2291         case wxFONTENCODING_MACMALAJALAM :
2292             enc = kCFStringEncodingMacMalayalam ;
2293             break ;
2294         case wxFONTENCODING_MACSINHALESE :
2295             enc = kCFStringEncodingMacSinhalese ;
2296             break ;
2297         case wxFONTENCODING_MACBURMESE :
2298             enc = kCFStringEncodingMacBurmese ;
2299             break ;
2300         case wxFONTENCODING_MACKHMER :
2301             enc = kCFStringEncodingMacKhmer ;
2302             break ;
2303         case wxFONTENCODING_MACTHAI :
2304             enc = kCFStringEncodingMacThai ;
2305             break ;
2306         case wxFONTENCODING_MACLAOTIAN :
2307             enc = kCFStringEncodingMacLaotian ;
2308             break ;
2309         case wxFONTENCODING_MACGEORGIAN :
2310             enc = kCFStringEncodingMacGeorgian ;
2311             break ;
2312         case wxFONTENCODING_MACARMENIAN :
2313             enc = kCFStringEncodingMacArmenian ;
2314             break ;
2315         case wxFONTENCODING_MACCHINESESIMP :
2316             enc = kCFStringEncodingMacChineseSimp ;
2317             break ;
2318         case wxFONTENCODING_MACTIBETAN :
2319             enc = kCFStringEncodingMacTibetan ;
2320             break ;
2321         case wxFONTENCODING_MACMONGOLIAN :
2322             enc = kCFStringEncodingMacMongolian ;
2323             break ;
2324         case wxFONTENCODING_MACETHIOPIC :
2325             enc = kCFStringEncodingMacEthiopic ;
2326             break ;
2327         case wxFONTENCODING_MACCENTRALEUR :
2328             enc = kCFStringEncodingMacCentralEurRoman ;
2329             break ;
2330         case wxFONTENCODING_MACVIATNAMESE :
2331             enc = kCFStringEncodingMacVietnamese ;
2332             break ;
2333         case wxFONTENCODING_MACARABICEXT :
2334             enc = kCFStringEncodingMacExtArabic ;
2335             break ;
2336         case wxFONTENCODING_MACSYMBOL :
2337             enc = kCFStringEncodingMacSymbol ;
2338             break ;
2339         case wxFONTENCODING_MACDINGBATS :
2340             enc = kCFStringEncodingMacDingbats ;
2341             break ;
2342         case wxFONTENCODING_MACTURKISH :
2343             enc = kCFStringEncodingMacTurkish ;
2344             break ;
2345         case wxFONTENCODING_MACCROATIAN :
2346             enc = kCFStringEncodingMacCroatian ;
2347             break ;
2348         case wxFONTENCODING_MACICELANDIC :
2349             enc = kCFStringEncodingMacIcelandic ;
2350             break ;
2351         case wxFONTENCODING_MACROMANIAN :
2352             enc = kCFStringEncodingMacRomanian ;
2353             break ;
2354         case wxFONTENCODING_MACCELTIC :
2355             enc = kCFStringEncodingMacCeltic ;
2356             break ;
2357         case wxFONTENCODING_MACGAELIC :
2358             enc = kCFStringEncodingMacGaelic ;
2359             break ;
2360 //      case wxFONTENCODING_MACKEYBOARD :
2361 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2362 //          break ;
2363         default :
2364             // because gcc is picky
2365             break ;
2366     } ;
2367     return enc ;
2368 }
2369
2370 class wxMBConv_cocoa : public wxMBConv
2371 {
2372 public:
2373     wxMBConv_cocoa()
2374     {
2375         Init(CFStringGetSystemEncoding()) ;
2376     }
2377
2378     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2379     {
2380         m_encoding = conv.m_encoding;
2381     }
2382
2383 #if wxUSE_FONTMAP
2384     wxMBConv_cocoa(const wxChar* name)
2385     {
2386         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2387     }
2388 #endif
2389
2390     wxMBConv_cocoa(wxFontEncoding encoding)
2391     {
2392         Init( wxCFStringEncFromFontEnc(encoding) );
2393     }
2394
2395     ~wxMBConv_cocoa()
2396     {
2397     }
2398
2399     void Init( CFStringEncoding encoding)
2400     {
2401         m_encoding = encoding ;
2402     }
2403
2404     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2405     {
2406         wxASSERT(szUnConv);
2407
2408         CFStringRef theString = CFStringCreateWithBytes (
2409                                                 NULL, //the allocator
2410                                                 (const UInt8*)szUnConv,
2411                                                 strlen(szUnConv),
2412                                                 m_encoding,
2413                                                 false //no BOM/external representation
2414                                                 );
2415
2416         wxASSERT(theString);
2417
2418         size_t nOutLength = CFStringGetLength(theString);
2419
2420         if (szOut == NULL)
2421         {
2422             CFRelease(theString);
2423             return nOutLength;
2424         }
2425
2426         CFRange theRange = { 0, nOutSize };
2427
2428 #if SIZEOF_WCHAR_T == 4
2429         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2430 #endif
2431
2432         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2433
2434         CFRelease(theString);
2435
2436         szUniCharBuffer[nOutLength] = '\0' ;
2437
2438 #if SIZEOF_WCHAR_T == 4
2439         wxMBConvUTF16 converter ;
2440         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2441         delete[] szUniCharBuffer;
2442 #endif
2443
2444         return nOutLength;
2445     }
2446
2447     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2448     {
2449         wxASSERT(szUnConv);
2450
2451         size_t nRealOutSize;
2452         size_t nBufSize = wxWcslen(szUnConv);
2453         UniChar* szUniBuffer = (UniChar*) szUnConv;
2454
2455 #if SIZEOF_WCHAR_T == 4
2456         wxMBConvUTF16 converter ;
2457         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2458         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2459         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2460         nBufSize /= sizeof(UniChar);
2461 #endif
2462
2463         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2464                                 NULL, //allocator
2465                                 szUniBuffer,
2466                                 nBufSize,
2467                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2468                             );
2469
2470         wxASSERT(theString);
2471
2472         //Note that CER puts a BOM when converting to unicode
2473         //so we  check and use getchars instead in that case
2474         if (m_encoding == kCFStringEncodingUnicode)
2475         {
2476             if (szOut != NULL)
2477                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2478
2479             nRealOutSize = CFStringGetLength(theString) + 1;
2480         }
2481         else
2482         {
2483             CFStringGetBytes(
2484                 theString,
2485                 CFRangeMake(0, CFStringGetLength(theString)),
2486                 m_encoding,
2487                 0, //what to put in characters that can't be converted -
2488                     //0 tells CFString to return NULL if it meets such a character
2489                 false, //not an external representation
2490                 (UInt8*) szOut,
2491                 nOutSize,
2492                 (CFIndex*) &nRealOutSize
2493                         );
2494         }
2495
2496         CFRelease(theString);
2497
2498 #if SIZEOF_WCHAR_T == 4
2499         delete[] szUniBuffer;
2500 #endif
2501
2502         return  nRealOutSize - 1;
2503     }
2504
2505     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2506
2507     bool IsOk() const
2508     {
2509         return m_encoding != kCFStringEncodingInvalidId &&
2510               CFStringIsEncodingAvailable(m_encoding);
2511     }
2512
2513 private:
2514     CFStringEncoding m_encoding ;
2515 };
2516
2517 #endif // defined(__WXCOCOA__)
2518
2519 // ============================================================================
2520 // Mac conversion classes
2521 // ============================================================================
2522
2523 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2524
2525 class wxMBConv_mac : public wxMBConv
2526 {
2527 public:
2528     wxMBConv_mac()
2529     {
2530         Init(CFStringGetSystemEncoding()) ;
2531     }
2532
2533     wxMBConv_mac(const wxMBConv_mac& conv)
2534     {
2535         Init(conv.m_char_encoding);
2536     }
2537
2538 #if wxUSE_FONTMAP
2539     wxMBConv_mac(const wxChar* name)
2540     {
2541         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2542     }
2543 #endif
2544
2545     wxMBConv_mac(wxFontEncoding encoding)
2546     {
2547         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2548     }
2549
2550     ~wxMBConv_mac()
2551     {
2552         OSStatus status = noErr ;
2553         status = TECDisposeConverter(m_MB2WC_converter);
2554         status = TECDisposeConverter(m_WC2MB_converter);
2555     }
2556
2557
2558     void Init( TextEncodingBase encoding)
2559     {
2560         OSStatus status = noErr ;
2561         m_char_encoding = encoding ;
2562         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2563
2564         status = TECCreateConverter(&m_MB2WC_converter,
2565                                     m_char_encoding,
2566                                     m_unicode_encoding);
2567         status = TECCreateConverter(&m_WC2MB_converter,
2568                                     m_unicode_encoding,
2569                                     m_char_encoding);
2570     }
2571
2572     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2573     {
2574         OSStatus status = noErr ;
2575         ByteCount byteOutLen ;
2576         ByteCount byteInLen = strlen(psz) ;
2577         wchar_t *tbuf = NULL ;
2578         UniChar* ubuf = NULL ;
2579         size_t res = 0 ;
2580
2581         if (buf == NULL)
2582         {
2583             //apple specs say at least 32
2584             n = wxMax( 32 , byteInLen ) ;
2585             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2586         }
2587         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2588 #if SIZEOF_WCHAR_T == 4
2589         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2590 #else
2591         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2592 #endif
2593         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2594           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2595 #if SIZEOF_WCHAR_T == 4
2596         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2597         // is not properly terminated we get random characters at the end
2598         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2599         wxMBConvUTF16 converter ;
2600         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2601         free( ubuf ) ;
2602 #else
2603         res = byteOutLen / sizeof( UniChar ) ;
2604 #endif
2605         if ( buf == NULL )
2606              free(tbuf) ;
2607
2608         if ( buf  && res < n)
2609             buf[res] = 0;
2610
2611         return res ;
2612     }
2613
2614     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2615     {
2616         OSStatus status = noErr ;
2617         ByteCount byteOutLen ;
2618         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2619
2620         char *tbuf = NULL ;
2621
2622         if (buf == NULL)
2623         {
2624             //apple specs say at least 32
2625             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2626             tbuf = (char*) malloc( n ) ;
2627         }
2628
2629         ByteCount byteBufferLen = n ;
2630         UniChar* ubuf = NULL ;
2631 #if SIZEOF_WCHAR_T == 4
2632         wxMBConvUTF16 converter ;
2633         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2634         byteInLen = unicharlen ;
2635         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2636         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2637 #else
2638         ubuf = (UniChar*) psz ;
2639 #endif
2640         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2641             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2642 #if SIZEOF_WCHAR_T == 4
2643         free( ubuf ) ;
2644 #endif
2645         if ( buf == NULL )
2646             free(tbuf) ;
2647
2648         size_t res = byteOutLen ;
2649         if ( buf  && res < n)
2650         {
2651             buf[res] = 0;
2652
2653             //we need to double-trip to verify it didn't insert any ? in place
2654             //of bogus characters
2655             wxWCharBuffer wcBuf(n);
2656             size_t pszlen = wxWcslen(psz);
2657             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2658                         wxWcslen(wcBuf) != pszlen ||
2659                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2660             {
2661                 // we didn't obtain the same thing we started from, hence
2662                 // the conversion was lossy and we consider that it failed
2663                 return (size_t)-1;
2664             }
2665         }
2666
2667         return res ;
2668     }
2669
2670     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2671
2672     bool IsOk() const
2673         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2674
2675 private:
2676     TECObjectRef m_MB2WC_converter ;
2677     TECObjectRef m_WC2MB_converter ;
2678
2679     TextEncodingBase m_char_encoding ;
2680     TextEncodingBase m_unicode_encoding ;
2681 };
2682
2683 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2684
2685 // ============================================================================
2686 // wxEncodingConverter based conversion classes
2687 // ============================================================================
2688
2689 #if wxUSE_FONTMAP
2690
2691 class wxMBConv_wxwin : public wxMBConv
2692 {
2693 private:
2694     void Init()
2695     {
2696         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2697                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2698     }
2699
2700 public:
2701     // temporarily just use wxEncodingConverter stuff,
2702     // so that it works while a better implementation is built
2703     wxMBConv_wxwin(const wxChar* name)
2704     {
2705         if (name)
2706             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2707         else
2708             m_enc = wxFONTENCODING_SYSTEM;
2709
2710         Init();
2711     }
2712
2713     wxMBConv_wxwin(wxFontEncoding enc)
2714     {
2715         m_enc = enc;
2716
2717         Init();
2718     }
2719
2720     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2721     {
2722         size_t inbuf = strlen(psz);
2723         if (buf)
2724         {
2725             if (!m2w.Convert(psz,buf))
2726                 return (size_t)-1;
2727         }
2728         return inbuf;
2729     }
2730
2731     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2732     {
2733         const size_t inbuf = wxWcslen(psz);
2734         if (buf)
2735         {
2736             if (!w2m.Convert(psz,buf))
2737                 return (size_t)-1;
2738         }
2739
2740         return inbuf;
2741     }
2742
2743     virtual size_t GetMBNulLen() const
2744     {
2745         switch ( m_enc )
2746         {
2747             case wxFONTENCODING_UTF16BE:
2748             case wxFONTENCODING_UTF16LE:
2749                 return 2;
2750
2751             case wxFONTENCODING_UTF32BE:
2752             case wxFONTENCODING_UTF32LE:
2753                 return 4;
2754
2755             default:
2756                 return 1;
2757         }
2758     }
2759
2760     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2761
2762     bool IsOk() const { return m_ok; }
2763
2764 public:
2765     wxFontEncoding m_enc;
2766     wxEncodingConverter m2w, w2m;
2767
2768 private:
2769     // were we initialized successfully?
2770     bool m_ok;
2771
2772     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2773 };
2774
2775 // make the constructors available for unit testing
2776 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2777 {
2778     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2779     if ( !result->IsOk() )
2780     {
2781         delete result;
2782         return 0;
2783     }
2784     return result;
2785 }
2786
2787 #endif // wxUSE_FONTMAP
2788
2789 // ============================================================================
2790 // wxCSConv implementation
2791 // ============================================================================
2792
2793 void wxCSConv::Init()
2794 {
2795     m_name = NULL;
2796     m_convReal =  NULL;
2797     m_deferred = true;
2798 }
2799
2800 wxCSConv::wxCSConv(const wxChar *charset)
2801 {
2802     Init();
2803
2804     if ( charset )
2805     {
2806         SetName(charset);
2807     }
2808
2809 #if wxUSE_FONTMAP
2810     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2811 #else
2812     m_encoding = wxFONTENCODING_SYSTEM;
2813 #endif
2814 }
2815
2816 wxCSConv::wxCSConv(wxFontEncoding encoding)
2817 {
2818     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2819     {
2820         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2821
2822         encoding = wxFONTENCODING_SYSTEM;
2823     }
2824
2825     Init();
2826
2827     m_encoding = encoding;
2828 }
2829
2830 wxCSConv::~wxCSConv()
2831 {
2832     Clear();
2833 }
2834
2835 wxCSConv::wxCSConv(const wxCSConv& conv)
2836         : wxMBConv()
2837 {
2838     Init();
2839
2840     SetName(conv.m_name);
2841     m_encoding = conv.m_encoding;
2842 }
2843
2844 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2845 {
2846     Clear();
2847
2848     SetName(conv.m_name);
2849     m_encoding = conv.m_encoding;
2850
2851     return *this;
2852 }
2853
2854 void wxCSConv::Clear()
2855 {
2856     free(m_name);
2857     delete m_convReal;
2858
2859     m_name = NULL;
2860     m_convReal = NULL;
2861 }
2862
2863 void wxCSConv::SetName(const wxChar *charset)
2864 {
2865     if (charset)
2866     {
2867         m_name = wxStrdup(charset);
2868         m_deferred = true;
2869     }
2870 }
2871
2872 #if wxUSE_FONTMAP
2873 #include "wx/hashmap.h"
2874
// hash map associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): it remembers, for each encoding, the
// name for which an iconv-based converter could be successfully created, so
// that the other names don't have to be tried again (an empty name makes
// DoCreate() return NULL immediately)
static wxEncodingNameCache gs_nameCache;
2879 #endif
2880
2881 wxMBConv *wxCSConv::DoCreate() const
2882 {
2883 #if wxUSE_FONTMAP
2884     wxLogTrace(TRACE_STRCONV,
2885                wxT("creating conversion for %s"),
2886                (m_name ? m_name
2887                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2888 #endif // wxUSE_FONTMAP
2889
2890     // check for the special case of ASCII or ISO8859-1 charset: as we have
2891     // special knowledge of it anyhow, we don't need to create a special
2892     // conversion object
2893     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2894             m_encoding == wxFONTENCODING_DEFAULT )
2895     {
2896         // don't convert at all
2897         return NULL;
2898     }
2899
2900     // we trust OS to do conversion better than we can so try external
2901     // conversion methods first
2902     //
2903     // the full order is:
2904     //      1. OS conversion (iconv() under Unix or Win32 API)
2905     //      2. hard coded conversions for UTF
2906     //      3. wxEncodingConverter as fall back
2907
2908     // step (1)
2909 #ifdef HAVE_ICONV
2910 #if !wxUSE_FONTMAP
2911     if ( m_name )
2912 #endif // !wxUSE_FONTMAP
2913     {
2914         wxString name(m_name);
2915         wxFontEncoding encoding(m_encoding);
2916
2917         if ( !name.empty() )
2918         {
2919             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2920             if ( conv->IsOk() )
2921                 return conv;
2922
2923             delete conv;
2924
2925 #if wxUSE_FONTMAP
2926             encoding =
2927                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2928 #endif // wxUSE_FONTMAP
2929         }
2930 #if wxUSE_FONTMAP
2931         {
2932             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2933             if ( it != gs_nameCache.end() )
2934             {
2935                 if ( it->second.empty() )
2936                     return NULL;
2937
2938                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2939                 if ( conv->IsOk() )
2940                     return conv;
2941
2942                 delete conv;
2943             }
2944
2945             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2946
2947             for ( ; *names; ++names )
2948             {
2949                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2950                 if ( conv->IsOk() )
2951                 {
2952                     gs_nameCache[encoding] = *names;
2953                     return conv;
2954                 }
2955
2956                 delete conv;
2957             }
2958
2959             gs_nameCache[encoding] = _T(""); // cache the failure
2960         }
2961 #endif // wxUSE_FONTMAP
2962     }
2963 #endif // HAVE_ICONV
2964
2965 #ifdef wxHAVE_WIN32_MB2WC
2966     {
2967 #if wxUSE_FONTMAP
2968         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2969                                       : new wxMBConv_win32(m_encoding);
2970         if ( conv->IsOk() )
2971             return conv;
2972
2973         delete conv;
2974 #else
2975         return NULL;
2976 #endif
2977     }
2978 #endif // wxHAVE_WIN32_MB2WC
2979 #if defined(__WXMAC__)
2980     {
2981         // leave UTF16 and UTF32 to the built-ins of wx
2982         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2983             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2984         {
2985
2986 #if wxUSE_FONTMAP
2987             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2988                                         : new wxMBConv_mac(m_encoding);
2989 #else
2990             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2991 #endif
2992             if ( conv->IsOk() )
2993                  return conv;
2994
2995             delete conv;
2996         }
2997     }
2998 #endif
2999 #if defined(__WXCOCOA__)
3000     {
3001         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3002         {
3003
3004 #if wxUSE_FONTMAP
3005             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3006                                           : new wxMBConv_cocoa(m_encoding);
3007 #else
3008             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3009 #endif
3010             if ( conv->IsOk() )
3011                  return conv;
3012
3013             delete conv;
3014         }
3015     }
3016 #endif
3017     // step (2)
3018     wxFontEncoding enc = m_encoding;
3019 #if wxUSE_FONTMAP
3020     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3021     {
3022         // use "false" to suppress interactive dialogs -- we can be called from
3023         // anywhere and popping up a dialog from here is the last thing we want to
3024         // do
3025         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3026     }
3027 #endif // wxUSE_FONTMAP
3028
3029     switch ( enc )
3030     {
3031         case wxFONTENCODING_UTF7:
3032              return new wxMBConvUTF7;
3033
3034         case wxFONTENCODING_UTF8:
3035              return new wxMBConvUTF8;
3036
3037         case wxFONTENCODING_UTF16BE:
3038              return new wxMBConvUTF16BE;
3039
3040         case wxFONTENCODING_UTF16LE:
3041              return new wxMBConvUTF16LE;
3042
3043         case wxFONTENCODING_UTF32BE:
3044              return new wxMBConvUTF32BE;
3045
3046         case wxFONTENCODING_UTF32LE:
3047              return new wxMBConvUTF32LE;
3048
3049         default:
3050              // nothing to do but put here to suppress gcc warnings
3051              ;
3052     }
3053
3054     // step (3)
3055 #if wxUSE_FONTMAP
3056     {
3057         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3058                                       : new wxMBConv_wxwin(m_encoding);
3059         if ( conv->IsOk() )
3060             return conv;
3061
3062         delete conv;
3063     }
3064 #endif // wxUSE_FONTMAP
3065
3066     // NB: This is a hack to prevent deadlock. What could otherwise happen
3067     //     in Unicode build: wxConvLocal creation ends up being here
3068     //     because of some failure and logs the error. But wxLog will try to
3069     //     attach timestamp, for which it will need wxConvLocal (to convert
3070     //     time to char* and then wchar_t*), but that fails, tries to log
3071     //     error, but wxLog has a (already locked) critical section that
3072     //     guards static buffer.
3073     static bool alreadyLoggingError = false;
3074     if (!alreadyLoggingError)
3075     {
3076         alreadyLoggingError = true;
3077         wxLogError(_("Cannot convert from the charset '%s'!"),
3078                    m_name ? m_name
3079                       :
3080 #if wxUSE_FONTMAP
3081                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3082 #else // !wxUSE_FONTMAP
3083                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3084 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3085               );
3086         alreadyLoggingError = false;
3087     }
3088
3089     return NULL;
3090 }
3091
3092 void wxCSConv::CreateConvIfNeeded() const
3093 {
3094     if ( m_deferred )
3095     {
3096         wxCSConv *self = (wxCSConv *)this; // const_cast
3097
3098 #if wxUSE_INTL
3099         // if we don't have neither the name nor the encoding, use the default
3100         // encoding for this system
3101         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3102         {
3103             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3104         }
3105 #endif // wxUSE_INTL
3106
3107         self->m_convReal = DoCreate();
3108         self->m_deferred = false;
3109     }
3110 }
3111
3112 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3113 {
3114     CreateConvIfNeeded();
3115
3116     if (m_convReal)
3117         return m_convReal->MB2WC(buf, psz, n);
3118
3119     // latin-1 (direct)
3120     size_t len = strlen(psz);
3121
3122     if (buf)
3123     {
3124         for (size_t c = 0; c <= len; c++)
3125             buf[c] = (unsigned char)(psz[c]);
3126     }
3127
3128     return len;
3129 }
3130
3131 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3132 {
3133     CreateConvIfNeeded();
3134
3135     if (m_convReal)
3136         return m_convReal->WC2MB(buf, psz, n);
3137
3138     // latin-1 (direct)
3139     const size_t len = wxWcslen(psz);
3140     if (buf)
3141     {
3142         for (size_t c = 0; c <= len; c++)
3143         {
3144             if (psz[c] > 0xFF)
3145                 return (size_t)-1;
3146             buf[c] = (char)psz[c];
3147         }
3148     }
3149     else
3150     {
3151         for (size_t c = 0; c <= len; c++)
3152         {
3153             if (psz[c] > 0xFF)
3154                 return (size_t)-1;
3155         }
3156     }
3157
3158     return len;
3159 }
3160
3161 size_t wxCSConv::GetMBNulLen() const
3162 {
3163     CreateConvIfNeeded();
3164
3165     if ( m_convReal )
3166     {
3167         return m_convReal->GetMBNulLen();
3168     }
3169
3170     return 1;
3171 }
3172
3173 // ----------------------------------------------------------------------------
3174 // globals
3175 // ----------------------------------------------------------------------------
3176
3177 #ifdef __WINDOWS__
3178     static wxMBConv_win32 wxConvLibcObj;
3179 #elif defined(__WXMAC__) && !defined(__MACH__)
3180     static wxMBConv_mac wxConvLibcObj ;
3181 #else
3182     static wxMBConvLibc wxConvLibcObj;
3183 #endif
3184
3185 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3186 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3187 static wxMBConvUTF7 wxConvUTF7Obj;
3188 static wxMBConvUTF8 wxConvUTF8Obj;
3189
3190 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3191 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3192 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3193 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3194 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3195 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3196 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3197 #ifdef __WXOSX__
3198                                     wxConvUTF8Obj;
3199 #else
3200                                     wxConvLibcObj;
3201 #endif
3202
3203
3204 #else // !wxUSE_WCHAR_T
3205
3206 // stand-ins in absence of wchar_t
3207 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3208                                 wxConvISO8859_1,
3209                                 wxConvLocal,
3210                                 wxConvUTF8;
3211
3212 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T