src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
// Convert the multibyte string src to wide characters.
//
// This default implementation works in terms of MB2WC(): the input is
// processed as a sequence of NUL-terminated chunks, each of which is converted
// by a separate MB2WC() call.
//
// Parameters:
//  dst     the output buffer; may be NULL, in which case nothing is written
//          and only the length is computed
//  dstLen  the size of dst in wide characters (only checked if dst != NULL)
//  src     the input string
//  srcLen  the length of src in bytes or (size_t)-1 if src is NUL-terminated,
//          in which case only the first chunk is converted
//
// Returns the number of wide characters written to dst (or which would have
// been written if dst is NULL): the sum of the chunk lengths plus one for each
// NUL separating two chunks. Returns wxCONV_FAILED if GetMBNulLen() or
// MB2WC() fails or if dst is too small.
//
// NOTE(review): wxMBConv::MB2WC() decrements the value returned from here to
// discount a trailing NUL, but this implementation does not appear to count
// one after the last chunk -- confirm which convention is intended.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string (only set, and only used,
    // when srcLen was specified)
    size_t nulLen wxDUMMY_INITIALIZE(0);

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != (size_t)-1 )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    for ( ;; )
    {
        // try to convert the current chunk: the first call only computes its
        // length
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == 0 )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if we already have a previous chunk, leave the NUL separating it
        // from this one
        if ( dstWritten )
        {
            dstWritten++;
            if ( dst )
                dst++;
        }

        dstWritten += lenChunk;

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // NOTE(review): the size passed here allows for a NUL after the
            // chunk which was not included in the dstLen check just above --
            // confirm that callers always provide room for it
            lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */);
            if ( lenChunk == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert the entire string in this case, as we suppose that the
            // string is NUL-terminated and so srcEnd is not used at all
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == (size_t)-1 )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(out, outLen, in);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(out, outLen, in);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
// The destructor is intentionally empty: it releases nothing itself and,
// according to the note below, is probably defined here only for the sake of
// linking under Darwin.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = MB2WC(NULL, psz, 0);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = WC2MB(NULL, pwz, 0);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             // extra space for trailing NUL(s)
 376             static const size_t extraLen = GetMaxMBNulLen();
 377
 378             wxCharBuffer buf(nLen + extraLen - 1);
 379             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 380                 return buf;
 381         }
 382     }
 383
 384     return wxCharBuffer();
 385 }
 386
 387 const wxWCharBuffer
 388 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 389 {
 390     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 391     if ( dstLen != wxCONV_FAILED )
 392     {
 393         wxWCharBuffer wbuf(dstLen);
 394         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 395         {
 396             if ( outLen )
 397                 *outLen = dstLen;
 398             return wbuf;
 399         }
 400     }
 401
 402     if ( outLen )
 403         *outLen = 0;
 404
 405     return wxWCharBuffer();
 406 }
 407
 408 const wxCharBuffer
 409 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 410 {
 411     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 412     if ( dstLen != wxCONV_FAILED )
 413     {
 414         wxCharBuffer buf(dstLen);
 415         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 416         {
 417             if ( outLen )
 418                 *outLen = dstLen;
 419             return buf;
 420         }
 421     }
 422
 423     if ( outLen )
 424         *outLen = 0;
 425
 426     return wxCharBuffer();
 427 }
 428
 429 // ----------------------------------------------------------------------------
 430 // wxMBConvLibc
 431 // ----------------------------------------------------------------------------
 432
 433 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 434 {
 435     return wxMB2WC(buf, psz, n);
 436 }
 437
 438 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 439 {
 440     return wxWC2MB(buf, psz, n);
 441 }
 442
 443 // ----------------------------------------------------------------------------
 444 // wxConvBrokenFileNames
 445 // ----------------------------------------------------------------------------
 446
 447 #ifdef __UNIX__
 448
 449 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 450 {
 451     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 452                   || wxStricmp(charset, _T("UTF8")) == 0  )
 453         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 454     else
 455         m_conv = new wxCSConv(charset);
 456 }
 457
 458 #endif // __UNIX__
 459
 460 // ----------------------------------------------------------------------------
 461 // UTF-7
 462 // ----------------------------------------------------------------------------
 463
 464 // Implementation (C) 2004 Fredrik Roubert
 465
 466 //
 467 // BASE64 decoding table
 468 //
// This table has one entry for each possible byte value: the entries for the
// characters of the BASE64 alphabet ('A'-'Z', 'a'-'z', '0'-'9', '+' and '/')
// contain the corresponding 6 bit value (0x00..0x3f) and all the other ones
// contain 0xff, which the decoder treats as the end of the BASE64 data.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 504
// Decode the NUL-terminated UTF-7 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and the
// conversion stops once n characters have been produced; a terminating NUL
// is appended only if there is still room for it. If buf is NULL, nothing is
// written and only the length is computed.
//
// Returns the number of wide characters produced, not counting the
// terminating NUL, or (size_t)-1 if a '+' is followed by neither '-' nor
// enough BASE64 characters to make up at least one byte.
//
// NOTE(review): the limit n is only checked between the input items, not
// while decoding a BASE64 run, so a long run could apparently write past the
// end of buf -- confirm that the callers size the buffer accordingly.
size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while ( *psz && (!buf || (len < n)) )
    {
        unsigned char cc = *psz++;
        if (cc != '+')
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;
        }
        else if (*psz == '-')
        {
            // encoded plus sign
            if (buf)
                *buf++ = cc;
            len++;
            psz++;
        }
        else // start of BASE64 encoded string
        {
            // d accumulates the decoded bits and l is the number of bits in
            // it not consumed yet; lsb tells whether the next decoded byte is
            // the low (true) or the high (false) byte of a 16 bit character;
            // ok becomes true as soon as at least one byte has been decoded
            bool lsb, ok;
            unsigned int d, l;
            for ( ok = lsb = false, d = 0, l = 0;
                  (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
                  psz++ )
            {
                // append the 6 bits of this BASE64 character
                d <<= 6;
                d += cc;
                for (l += 6; l >= 8; lsb = !lsb)
                {
                    // extract the next complete byte
                    unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
                    if (lsb)
                    {
                        // the low byte completes the character
                        if (buf)
                            *buf++ |= c;
                        len ++;
                    }
                    else
                    {
                        // the high byte starts a new character
                        if (buf)
                            *buf = (wchar_t)(c << 8);
                    }

                    ok = true;
                }
            }

            if ( !ok )
            {
                // in valid UTF7 we should have valid characters after '+'
                return (size_t)-1;
            }

            // skip the optional '-' terminating the BASE64 run
            if (*psz == '-')
                psz++;
        }
    }

    if ( buf && (len < n) )
        *buf = '\0';

    return len;
}
 572
 573 //
 574 // BASE64 encoding table
 575 //
// Maps a 6 bit value (0..63) to the corresponding character of the BASE64
// alphabet.
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 587
 588 //
 589 // UTF-7 encoding table
 590 //
 591 // 0 - Set D (directly encoded characters)
 592 // 1 - Set O (optional direct characters)
 593 // 2 - whitespace characters (optional)
 594 // 3 - special characters
 595 //
// Gives the class (0 to 3) of each of the 128 ASCII characters, indexed by
// the character code. wxMBConvUTF7::WC2MB() only writes the characters of
// class 0 directly, all the other ones are BASE64-encoded.
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 607
// Encode the NUL-terminated wide string psz in UTF-7.
//
// If buf is not NULL, the result is stored in it and the conversion stops
// once n bytes have been produced; a terminating NUL is appended only if
// there is still room for it. If buf is NULL, nothing is written and only the
// length is computed.
//
// Returns the number of bytes produced, not counting the terminating NUL, or
// (size_t)-1 if, when WC_UTF16 is not defined, a character greater than
// 0xffff is found at the start of an item.
//
// NOTE(review): the limit n is only checked between the input items, not
// while writing a BASE64 run, so a long run could apparently write past the
// end of buf -- confirm that the callers size the buffer accordingly.
//
// NOTE(review): where wchar_t is a signed type, a negative cc would pass the
// "cc < 0x80" test and index utf7encode out of bounds -- confirm whether such
// values can reach this function.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;
            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return (size_t)-1;
        }
#endif
        else
        {
            // everything else is written as '+', BASE64 data and '-', except
            // for '+' itself which is written as just "+-"
            if (buf)
                *buf++ = '+';
            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the high byte of the character first, then the
                    // low one
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all the complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // the run continues until the end of the string or the
                    // next character which can be written directly; note
                    // that psz is only advanced past this character by the
                    // loop increment, i.e. only if the run continues
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // output the remaining l (2 or 4) bits, if any, padded with
                // zeroes on the right to a full group of 6 bits
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
                    len++;
                }
            }
            if (buf)
                *buf++ = '-';
            len++;
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
 673
 674 // ----------------------------------------------------------------------------
 675 // UTF-8
 676 // ----------------------------------------------------------------------------
 677
// utf8_max[i] is the greatest value which the UTF-8 encoder in this file
// writes using i + 1 bytes; the decoder also uses this table to reject the
// sequences encoding a value which fits in a shorter one. The last element is
// the greatest 32 bit value, so a search for the first element not less than
// a given value always stops inside the table.
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 680
// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the byte b is mapped to
// wxUnicodePUA + b, so the range [wxUnicodePUA, wxUnicodePUAEnd) has one
// value for each of the 256 possible bytes
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 685
// Decode the NUL-terminated UTF-8 string psz.
//
// If buf is not NULL, the decoded characters are stored in it and the
// conversion stops once n characters have been produced; a terminating NUL
// is appended only if there is still room for it. If buf is NULL, nothing is
// written and only the length is computed.
//
// The handling of the invalid input depends on m_options:
//  - with MAP_INVALID_UTF8_TO_PUA, each byte b of an invalid sequence is
//    converted to the character wxUnicodePUA + b
//  - with MAP_INVALID_UTF8_TO_OCTAL, each such byte is converted to a
//    backslash followed by 3 octal digits and the backslashes present in the
//    input are doubled
//  - otherwise the conversion fails
//
// Returns the number of wide characters produced, not counting the
// terminating NUL, or (size_t)-1 on failure.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to go back to it if
        // it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;
        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of the bytes expected to follow
            // the first one
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                // ocnt is the index in utf8_max of the greatest value which
                // fits in a sequence shorter by one byte than this one
                unsigned ocnt = cnt - 1;

                // start with the payload bits of the first byte and then add
                // 6 bits from each of the following ones
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }
                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == (size_t)-1)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }
            if (invalid)
            {
                // all the bytes from opsz up to (but not including) psz are
                // part of the invalid sequence
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != (size_t)-1);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape sequence is only written if all of its 4
                        // characters fit in the buffer, but the length is
                        // increased by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }
                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return (size_t)-1;
                }
            }
        }
    }
    if (buf && (len < n))
        *buf = 0;
    return len;
}
 822
 823 static inline bool isoctal(wchar_t wch)
 824 {
 825     return L'0' <= wch && wch <= L'7';
 826 }
 827
// Encode the NUL-terminated wide string psz in UTF-8.
//
// If buf is not NULL, the result is stored in it and the conversion stops
// once n bytes have been produced; a terminating NUL is appended only if
// there is still room for it. If buf is NULL, nothing is written and only the
// length is computed.
//
// The mappings optionally applied when decoding, depending on m_options, are
// undone here:
//  - with MAP_INVALID_UTF8_TO_PUA, a character in the range
//    [wxUnicodePUA, wxUnicodePUAEnd) is written as a single byte
//  - with MAP_INVALID_UTF8_TO_OCTAL, a double backslash is written as a
//    single one and a backslash followed by 3 octal digits is written as the
//    single byte with this value
//
// Returns the number of bytes produced, not counting the terminating NUL.
//
// NOTE(review): the limit n is only checked before encoding each character,
// so a multibyte sequence could apparently be written past the end of buf --
// confirm that the callers size the buffer accordingly.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;
#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // if decoding failed, just skip a single unit: cc is still set to
        // its value in this case
        psz += (pa == (size_t)-1) ? 1 : pa;
#else
        cc=(*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output just one of them
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte with the value it specifies
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0')*0100 +
                                 (psz[1] - L'0')*010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // cnt is the number of the bytes which follow the first one in
            // the encoding of this character
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }

            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the first byte starts with cnt + 1 bits set to 1
                    // followed by a 0 bit, the remaining bits are the most
                    // significant ones of the character
                    //
                    // NOTE(review): this relies on the right shift of a
                    // negative value propagating the sign bit
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // each of the following bytes has the form 10xxxxxx and
                    // contains the next 6 bits of the character
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len<n))
        *buf = 0;

    return len;
}
 902
 903 // ----------------------------------------------------------------------------
 904 // UTF-16
 905 // ----------------------------------------------------------------------------
 906
 907 #ifdef WORDS_BIGENDIAN
 908     #define wxMBConvUTF16straight wxMBConvUTF16BE
 909     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 910 #else
 911     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 912     #define wxMBConvUTF16straight wxMBConvUTF16LE
 913 #endif
 914
 915
 916 #ifdef WC_UTF16
 917
 918 // copy 16bit MB to 16bit String
 919 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 920 {
 921     size_t len=0;
 922
 923     while (*(wxUint16*)psz && (!buf || len < n))
 924     {
 925         if (buf)
 926             *buf++ = *(wxUint16*)psz;
 927         len++;
 928
 929         psz += sizeof(wxUint16);
 930     }
 931     if (buf && len<n)   *buf=0;
 932
 933     return len;
 934 }
 935
 936
 937 // copy 16bit String to 16bit MB
 938 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 939 {
 940     size_t len=0;
 941
 942     while (*psz && (!buf || len < n))
 943     {
 944         if (buf)
 945         {
 946             *(wxUint16*)buf = *psz;
 947             buf += sizeof(wxUint16);
 948         }
 949         len += sizeof(wxUint16);
 950         psz++;
 951     }
 952     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 953
 954     return len;
 955 }
 956
 957
 958 // swap 16bit MB to 16bit String
 959 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 960 {
 961     size_t len = 0;
 962
 963     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 964     // inside the string
 965     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 966     {
 967         if ( buf )
 968         {
 969             ((char *)buf)[0] = psz[1];
 970             ((char *)buf)[1] = psz[0];
 971             buf++;
 972         }
 973         len++;
 974         psz += 2;
 975     }
 976
 977     if ( buf && len < n )
 978         *buf = L'\0';
 979
 980     return len;
 981 }
 982
 983
 984 // swap 16bit MB to 16bit String
 985 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 986 {
 987     size_t len = 0;
 988
 989     while ( *psz && (!buf || len < n) )
 990     {
 991         if ( buf )
 992         {
 993             *buf++ = ((char*)psz)[1];
 994             *buf++ = ((char*)psz)[0];
 995         }
 996         len += 2;
 997         psz++;
 998     }
 999
1000     if ( buf && len < n )
1001         *buf = '\0';
1002
1003     return len;
1004 }
1005
1006
1007 #else // WC_UTF16
1008
1009
1010 // copy 16bit MB to 32bit String
1011 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1012 {
1013     size_t len=0;
1014
1015     while (*(wxUint16*)psz && (!buf || len < n))
1016     {
1017         wxUint32 cc;
1018         size_t pa=decode_utf16((wxUint16*)psz, cc);
1019         if (pa == (size_t)-1)
1020             return pa;
1021
1022         if (buf)
1023             *buf++ = (wchar_t)cc;
1024         len++;
1025         psz += pa * sizeof(wxUint16);
1026     }
1027     if (buf && len<n)   *buf=0;
1028
1029     return len;
1030 }
1031
1032
1033 // copy 32bit String to 16bit MB
1034 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1035 {
1036     size_t len=0;
1037
1038     while (*psz && (!buf || len < n))
1039     {
1040         wxUint16 cc[2];
1041         size_t pa=encode_utf16(*psz, cc);
1042
1043         if (pa == (size_t)-1)
1044             return pa;
1045
1046         if (buf)
1047         {
1048             *(wxUint16*)buf = cc[0];
1049             buf += sizeof(wxUint16);
1050             if (pa > 1)
1051             {
1052                 *(wxUint16*)buf = cc[1];
1053                 buf += sizeof(wxUint16);
1054             }
1055         }
1056
1057         len += pa*sizeof(wxUint16);
1058         psz++;
1059     }
1060     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1061
1062     return len;
1063 }
1064
1065
1066 // swap 16bit MB to 32bit String
1067 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1068 {
1069     size_t len=0;
1070
1071     while (*(wxUint16*)psz && (!buf || len < n))
1072     {
1073         wxUint32 cc;
1074         char tmp[4];
1075         tmp[0]=psz[1];  tmp[1]=psz[0];
1076         tmp[2]=psz[3];  tmp[3]=psz[2];
1077
1078         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1079         if (pa == (size_t)-1)
1080             return pa;
1081
1082         if (buf)
1083             *buf++ = (wchar_t)cc;
1084
1085         len++;
1086         psz += pa * sizeof(wxUint16);
1087     }
1088     if (buf && len<n)   *buf=0;
1089
1090     return len;
1091 }
1092
1093
1094 // swap 32bit String to 16bit MB
1095 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1096 {
1097     size_t len=0;
1098
1099     while (*psz && (!buf || len < n))
1100     {
1101         wxUint16 cc[2];
1102         size_t pa=encode_utf16(*psz, cc);
1103
1104         if (pa == (size_t)-1)
1105             return pa;
1106
1107         if (buf)
1108         {
1109             *buf++ = ((char*)cc)[1];
1110             *buf++ = ((char*)cc)[0];
1111             if (pa > 1)
1112             {
1113                 *buf++ = ((char*)cc)[3];
1114                 *buf++ = ((char*)cc)[2];
1115             }
1116         }
1117
1118         len += pa*sizeof(wxUint16);
1119         psz++;
1120     }
1121     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1122
1123     return len;
1124 }
1125
1126 #endif // WC_UTF16
1127
1128
1129 // ----------------------------------------------------------------------------
1130 // UTF-32
1131 // ----------------------------------------------------------------------------
1132
// As for UTF-16: the converters below exist in a "straight" and in a "swap"
// variant and these aliases bind each variant to the concrete BE/LE class
// depending on whether WORDS_BIGENDIAN is defined for this build.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1144
1145
1146 #ifdef WC_UTF16
1147
1148 // copy 32bit MB to 16bit String
1149 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1150 {
1151     size_t len=0;
1152
1153     while (*(wxUint32*)psz && (!buf || len < n))
1154     {
1155         wxUint16 cc[2];
1156
1157         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1158         if (pa == (size_t)-1)
1159             return pa;
1160
1161         if (buf)
1162         {
1163             *buf++ = cc[0];
1164             if (pa > 1)
1165                 *buf++ = cc[1];
1166         }
1167         len += pa;
1168         psz += sizeof(wxUint32);
1169     }
1170     if (buf && len<n)   *buf=0;
1171
1172     return len;
1173 }
1174
1175
1176 // copy 16bit String to 32bit MB
1177 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1178 {
1179     size_t len=0;
1180
1181     while (*psz && (!buf || len < n))
1182     {
1183         wxUint32 cc;
1184
1185         // cast is ok for WC_UTF16
1186         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1187         if (pa == (size_t)-1)
1188             return pa;
1189
1190         if (buf)
1191         {
1192             *(wxUint32*)buf = cc;
1193             buf += sizeof(wxUint32);
1194         }
1195         len += sizeof(wxUint32);
1196         psz += pa;
1197     }
1198
1199     if (buf && len<=n-sizeof(wxUint32))
1200         *(wxUint32*)buf=0;
1201
1202     return len;
1203 }
1204
1205
1206
1207 // swap 32bit MB to 16bit String
1208 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1209 {
1210     size_t len=0;
1211
1212     while (*(wxUint32*)psz && (!buf || len < n))
1213     {
1214         char tmp[4];
1215         tmp[0] = psz[3];   tmp[1] = psz[2];
1216         tmp[2] = psz[1];   tmp[3] = psz[0];
1217
1218
1219         wxUint16 cc[2];
1220
1221         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1222         if (pa == (size_t)-1)
1223             return pa;
1224
1225         if (buf)
1226         {
1227             *buf++ = cc[0];
1228             if (pa > 1)
1229                 *buf++ = cc[1];
1230         }
1231         len += pa;
1232         psz += sizeof(wxUint32);
1233     }
1234
1235     if (buf && len<n)
1236         *buf=0;
1237
1238     return len;
1239 }
1240
1241
1242 // swap 16bit String to 32bit MB
1243 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1244 {
1245     size_t len=0;
1246
1247     while (*psz && (!buf || len < n))
1248     {
1249         char cc[4];
1250
1251         // cast is ok for WC_UTF16
1252         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1253         if (pa == (size_t)-1)
1254             return pa;
1255
1256         if (buf)
1257         {
1258             *buf++ = cc[3];
1259             *buf++ = cc[2];
1260             *buf++ = cc[1];
1261             *buf++ = cc[0];
1262         }
1263         len += sizeof(wxUint32);
1264         psz += pa;
1265     }
1266
1267     if (buf && len<=n-sizeof(wxUint32))
1268         *(wxUint32*)buf=0;
1269
1270     return len;
1271 }
1272
1273 #else // WC_UTF16
1274
1275
1276 // copy 32bit MB to 32bit String
1277 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1278 {
1279     size_t len=0;
1280
1281     while (*(wxUint32*)psz && (!buf || len < n))
1282     {
1283         if (buf)
1284             *buf++ = (wchar_t)(*(wxUint32*)psz);
1285         len++;
1286         psz += sizeof(wxUint32);
1287     }
1288
1289     if (buf && len<n)
1290         *buf=0;
1291
1292     return len;
1293 }
1294
1295
1296 // copy 32bit String to 32bit MB
1297 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1298 {
1299     size_t len=0;
1300
1301     while (*psz && (!buf || len < n))
1302     {
1303         if (buf)
1304         {
1305             *(wxUint32*)buf = *psz;
1306             buf += sizeof(wxUint32);
1307         }
1308
1309         len += sizeof(wxUint32);
1310         psz++;
1311     }
1312
1313     if (buf && len<=n-sizeof(wxUint32))
1314         *(wxUint32*)buf=0;
1315
1316     return len;
1317 }
1318
1319
1320 // swap 32bit MB to 32bit String
1321 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1322 {
1323     size_t len=0;
1324
1325     while (*(wxUint32*)psz && (!buf || len < n))
1326     {
1327         if (buf)
1328         {
1329             ((char *)buf)[0] = psz[3];
1330             ((char *)buf)[1] = psz[2];
1331             ((char *)buf)[2] = psz[1];
1332             ((char *)buf)[3] = psz[0];
1333             buf++;
1334         }
1335         len++;
1336         psz += sizeof(wxUint32);
1337     }
1338
1339     if (buf && len<n)
1340         *buf=0;
1341
1342     return len;
1343 }
1344
1345
1346 // swap 32bit String to 32bit MB
1347 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1348 {
1349     size_t len=0;
1350
1351     while (*psz && (!buf || len < n))
1352     {
1353         if (buf)
1354         {
1355             *buf++ = ((char *)psz)[3];
1356             *buf++ = ((char *)psz)[2];
1357             *buf++ = ((char *)psz)[1];
1358             *buf++ = ((char *)psz)[0];
1359         }
1360         len += sizeof(wxUint32);
1361         psz++;
1362     }
1363
1364     if (buf && len<=n-sizeof(wxUint32))
1365         *(wxUint32*)buf=0;
1366
1367     return len;
1368 }
1369
1370
1371 #endif // WC_UTF16
1372
1373
1374 // ============================================================================
1375 // The classes doing conversion using the iconv_xxx() functions
1376 // ============================================================================
1377
1378 #ifdef HAVE_ICONV
1379
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if the output buffer is _exactly_ as big as needed. Such a case is
//     (unless there's yet another bug in glibc) the only one in which iconv()
//     returns (size_t)-1 (which means error) while reporting 0 bytes left in
//     the input buffer -- when a _real_ error occurs, the number of bytes left
//     in the input buffer is non-zero. Hence this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of an input pointer to the type used for the second
// argument of iconv() in this file (ICONV_CONST is defined outside of it)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value with which iconv_open() results are compared to detect failure
#define ICONV_T_INVALID ((iconv_t)-1)

// byte swapping macro and font encoding corresponding to the size of wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1409
1410 // ----------------------------------------------------------------------------
1411 // wxMBConv_iconv: encapsulates an iconv character set
1412 // ----------------------------------------------------------------------------
1413
// Converter between a named multibyte charset and wchar_t implemented on top
// of iconv(): it owns one iconv handle per conversion direction.
class wxMBConv_iconv : public wxMBConv
{
public:
    // opens the iconv handles for the charset with the given name; use IsOk()
    // to check whether this succeeded
    wxMBConv_iconv(const wxChar *name);

    // closes the handles which were successfully opened
    virtual ~wxMBConv_iconv();

    // multibyte -> wide char conversion: returns the number of wide
    // characters or (size_t)-1 on error; buf may be NULL to only compute the
    // length
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;

    // wide char -> multibyte conversion: returns the number of bytes or
    // (size_t)-1 on error; buf may be NULL to only compute the length
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // classify this encoding as explained in wxMBConv::GetMBNulLen()
    // comment
    virtual size_t GetMBNulLen() const;

    // true only if the handles for both conversion directions are valid
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;
#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially and to (size_t)-1 if it couldn't be determined
    size_t m_minMBCharWidth;
};
1453
1454 // make the constructor available for unit testing
1455 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1456 {
1457     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1458     if ( !result->IsOk() )
1459     {
1460         delete result;
1461         return 0;
1462     }
1463     return result;
1464 }
1465
// iconv name of the charset used for wchar_t: starts out empty and is filled
// in by the wxMBConv_iconv constructor while it is still empty
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor's probe conversion if wide characters produced by
// ms_wcCharsetName need to be byte-swapped
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1468
1469 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1470 {
1471     m_minMBCharWidth = 0;
1472
1473     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1474     // names for the charsets
1475     const wxCharBuffer cname(wxString(name).ToAscii());
1476
1477     // check for charset that represents wchar_t:
1478     if ( ms_wcCharsetName.empty() )
1479     {
1480         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1481
1482 #if wxUSE_FONTMAP
1483         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1484 #else // !wxUSE_FONTMAP
1485         static const wxChar *names[] =
1486         {
1487 #if SIZEOF_WCHAR_T == 4
1488             _T("UCS-4"),
1489 #elif SIZEOF_WCHAR_T = 2
1490             _T("UCS-2"),
1491 #endif
1492             NULL
1493         };
1494 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1495
1496         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1497         {
1498             const wxString nameCS(*names);
1499
1500             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1501             wxString nameXE(nameCS);
1502             #ifdef WORDS_BIGENDIAN
1503                 nameXE += _T("BE");
1504             #else // little endian
1505                 nameXE += _T("LE");
1506             #endif
1507
1508             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1509                        nameXE.c_str());
1510
1511             m2w = iconv_open(nameXE.ToAscii(), cname);
1512             if ( m2w == ICONV_T_INVALID )
1513             {
1514                 // try charset w/o bytesex info (e.g. "UCS4")
1515                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1516                            nameCS.c_str());
1517                 m2w = iconv_open(nameCS.ToAscii(), cname);
1518
1519                 // and check for bytesex ourselves:
1520                 if ( m2w != ICONV_T_INVALID )
1521                 {
1522                     char    buf[2], *bufPtr;
1523                     wchar_t wbuf[2], *wbufPtr;
1524                     size_t  insz, outsz;
1525                     size_t  res;
1526
1527                     buf[0] = 'A';
1528                     buf[1] = 0;
1529                     wbuf[0] = 0;
1530                     insz = 2;
1531                     outsz = SIZEOF_WCHAR_T * 2;
1532                     wbufPtr = wbuf;
1533                     bufPtr = buf;
1534
1535                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1536                                 (char**)&wbufPtr, &outsz);
1537
1538                     if (ICONV_FAILED(res, insz))
1539                     {
1540                         wxLogLastError(wxT("iconv"));
1541                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1542                                    nameCS.c_str());
1543                     }
1544                     else // ok, can convert to this encoding, remember it
1545                     {
1546                         ms_wcCharsetName = nameCS;
1547                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1548                     }
1549                 }
1550             }
1551             else // use charset not requiring byte swapping
1552             {
1553                 ms_wcCharsetName = nameXE;
1554             }
1555         }
1556
1557         wxLogTrace(TRACE_STRCONV,
1558                    wxT("iconv wchar_t charset is \"%s\"%s"),
1559                    ms_wcCharsetName.empty() ? _T("<none>")
1560                                             : ms_wcCharsetName.c_str(),
1561                    ms_wcNeedsSwap ? _T(" (needs swap)")
1562                                   : _T(""));
1563     }
1564     else // we already have ms_wcCharsetName
1565     {
1566         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1567     }
1568
1569     if ( ms_wcCharsetName.empty() )
1570     {
1571         w2m = ICONV_T_INVALID;
1572     }
1573     else
1574     {
1575         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1576         if ( w2m == ICONV_T_INVALID )
1577         {
1578             wxLogTrace(TRACE_STRCONV,
1579                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1580                        ms_wcCharsetName.c_str(), cname.data());
1581         }
1582     }
1583 }
1584
1585 wxMBConv_iconv::~wxMBConv_iconv()
1586 {
1587     if ( m2w != ICONV_T_INVALID )
1588         iconv_close(m2w);
1589     if ( w2m != ICONV_T_INVALID )
1590         iconv_close(w2m);
1591 }
1592
1593 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1594 {
1595     // find the string length: notice that must be done differently for
1596     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1597     size_t inbuf;
1598     const size_t nulLen = GetMBNulLen();
1599     switch ( nulLen )
1600     {
1601         default:
1602             return (size_t)-1;
1603
1604         case 1:
1605             inbuf = strlen(psz); // arguably more optimized than our version
1606             break;
1607
1608         case 2:
1609         case 4:
1610             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1611             // they also have to start at character boundary and not span two
1612             // adjacent characters
1613             const char *p;
1614             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1615                 ;
1616             inbuf = p - psz;
1617             break;
1618     }
1619
1620 #if wxUSE_THREADS
1621     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1622     //     Unfortunately there is a couple of global wxCSConv objects such as
1623     //     wxConvLocal that are used all over wx code, so we have to make sure
1624     //     the handle is used by at most one thread at the time. Otherwise
1625     //     only a few wx classes would be safe to use from non-main threads
1626     //     as MB<->WC conversion would fail "randomly".
1627     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1628 #endif // wxUSE_THREADS
1629
1630
1631     size_t outbuf = n * SIZEOF_WCHAR_T;
1632     size_t res, cres;
1633     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1634     wchar_t *bufPtr = buf;
1635     const char *pszPtr = psz;
1636
1637     if (buf)
1638     {
1639         // have destination buffer, convert there
1640         cres = iconv(m2w,
1641                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1642                      (char**)&bufPtr, &outbuf);
1643         res = n - (outbuf / SIZEOF_WCHAR_T);
1644
1645         if (ms_wcNeedsSwap)
1646         {
1647             // convert to native endianness
1648             for ( unsigned i = 0; i < res; i++ )
1649                 buf[n] = WC_BSWAP(buf[i]);
1650         }
1651
1652         // NUL-terminate the string if there is any space left
1653         if (res < n)
1654             buf[res] = 0;
1655     }
1656     else
1657     {
1658         // no destination buffer... convert using temp buffer
1659         // to calculate destination buffer requirement
1660         wchar_t tbuf[8];
1661         res = 0;
1662         do {
1663             bufPtr = tbuf;
1664             outbuf = 8*SIZEOF_WCHAR_T;
1665
1666             cres = iconv(m2w,
1667                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1668                          (char**)&bufPtr, &outbuf );
1669
1670             res += 8-(outbuf/SIZEOF_WCHAR_T);
1671         } while ((cres==(size_t)-1) && (errno==E2BIG));
1672     }
1673
1674     if (ICONV_FAILED(cres, inbuf))
1675     {
1676         //VS: it is ok if iconv fails, hence trace only
1677         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1678         return (size_t)-1;
1679     }
1680
1681     return res;
1682 }
1683
1684 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1685 {
1686 #if wxUSE_THREADS
1687     // NB: explained in MB2WC
1688     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1689 #endif
1690
1691     size_t inlen = wxWcslen(psz);
1692     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1693     size_t outbuf = n;
1694     size_t res, cres;
1695
1696     wchar_t *tmpbuf = 0;
1697
1698     if (ms_wcNeedsSwap)
1699     {
1700         // need to copy to temp buffer to switch endianness
1701         // (doing WC_BSWAP twice on the original buffer won't help, as it
1702         //  could be in read-only memory, or be accessed in some other thread)
1703         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1704         for ( size_t i = 0; i < inlen; i++ )
1705             tmpbuf[n] = WC_BSWAP(psz[i]);
1706         tmpbuf[inlen] = L'\0';
1707         psz = tmpbuf;
1708     }
1709
1710     if (buf)
1711     {
1712         // have destination buffer, convert there
1713         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1714
1715         res = n-outbuf;
1716
1717         // NB: iconv was given only wcslen(psz) characters on input, and so
1718         //     it couldn't convert the trailing zero. Let's do it ourselves
1719         //     if there's some room left for it in the output buffer.
1720         if (res < n)
1721             buf[0] = 0;
1722     }
1723     else
1724     {
1725         // no destination buffer... convert using temp buffer
1726         // to calculate destination buffer requirement
1727         char tbuf[16];
1728         res = 0;
1729         do {
1730             buf = tbuf; outbuf = 16;
1731
1732             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1733
1734             res += 16 - outbuf;
1735         } while ((cres==(size_t)-1) && (errno==E2BIG));
1736     }
1737
1738     if (ms_wcNeedsSwap)
1739     {
1740         free(tmpbuf);
1741     }
1742
1743     if (ICONV_FAILED(cres, inbuf))
1744     {
1745         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1746         return (size_t)-1;
1747     }
1748
1749     return res;
1750 }
1751
1752 size_t wxMBConv_iconv::GetMBNulLen() const
1753 {
1754     if ( m_minMBCharWidth == 0 )
1755     {
1756         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1757
1758 #if wxUSE_THREADS
1759         // NB: explained in MB2WC
1760         wxMutexLocker lock(self->m_iconvMutex);
1761 #endif
1762
1763         wchar_t *wnul = L"";
1764         char buf[8]; // should be enough for NUL in any encoding
1765         size_t inLen = sizeof(wchar_t),
1766                outLen = WXSIZEOF(buf);
1767         char *in = (char *)wnul;
1768         char *out = buf;
1769         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1770         {
1771             self->m_minMBCharWidth = (size_t)-1;
1772         }
1773         else // ok
1774         {
1775             self->m_minMBCharWidth = out - buf;
1776         }
1777     }
1778
1779     return m_minMBCharWidth;
1780 }
1781
1782 #endif // HAVE_ICONV
1783
1784
1785 // ============================================================================
1786 // Win32 conversion classes
1787 // ============================================================================
1788
1789 #ifdef wxHAVE_WIN32_MB2WC
1790
1791 // from utils.cpp
1792 #if wxUSE_FONTMAP
1793 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1794 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1795 #endif
1796
1797 class wxMBConv_win32 : public wxMBConv
1798 {
1799 public:
1800     wxMBConv_win32()
1801     {
1802         m_CodePage = CP_ACP;
1803         m_minMBCharWidth = 0;
1804     }
1805
1806 #if wxUSE_FONTMAP
1807     wxMBConv_win32(const wxChar* name)
1808     {
1809         m_CodePage = wxCharsetToCodepage(name);
1810         m_minMBCharWidth = 0;
1811     }
1812
1813     wxMBConv_win32(wxFontEncoding encoding)
1814     {
1815         m_CodePage = wxEncodingToCodepage(encoding);
1816         m_minMBCharWidth = 0;
1817     }
1818 #endif // wxUSE_FONTMAP
1819
1820     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1821     {
1822         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1823         // the behaviour is not compatible with the Unix version (using iconv)
1824         // and break the library itself, e.g. wxTextInputStream::NextChar()
1825         // wouldn't work if reading an incomplete MB char didn't result in an
1826         // error
1827         //
1828         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1829         // an error (tested under Windows Server 2003) and apparently it is
1830         // done on purpose, i.e. the function accepts any input in this case
1831         // and although I'd prefer to return error on ill-formed output, our
1832         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1833         // explicitly ill-formed according to RFC 2152) neither so we don't
1834         // even have any fallback here...
1835         //
1836         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1837         // Win XP or newer and if it is specified on older versions, conversion
1838         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1839         // fails. So we can only use the flag on newer Windows versions.
1840         // Additionally, the flag is not supported by UTF7, symbol and CJK
1841         // encodings. See here:
1842         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1843         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1844         int flags = 0;
1845         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1846              m_CodePage < 50000 &&
1847              IsAtLeastWin2kSP4() )
1848         {
1849             flags = MB_ERR_INVALID_CHARS;
1850         }
1851         else if ( m_CodePage == CP_UTF8 )
1852         {
1853             // Avoid round-trip in the special case of UTF-8 by using our
1854             // own UTF-8 conversion code:
1855             return wxMBConvUTF8().MB2WC(buf, psz, n);
1856         }
1857
1858         const size_t len = ::MultiByteToWideChar
1859                              (
1860                                 m_CodePage,     // code page
1861                                 flags,          // flags: fall on error
1862                                 psz,            // input string
1863                                 -1,             // its length (NUL-terminated)
1864                                 buf,            // output string
1865                                 buf ? n : 0     // size of output buffer
1866                              );
1867         if ( !len )
1868         {
1869             // function totally failed
1870             return (size_t)-1;
1871         }
1872
1873         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1874         // check if we succeeded, by doing a double trip:
1875         if ( !flags && buf )
1876         {
1877             const size_t mbLen = strlen(psz);
1878             wxCharBuffer mbBuf(mbLen);
1879             if ( ::WideCharToMultiByte
1880                    (
1881                       m_CodePage,
1882                       0,
1883                       buf,
1884                       -1,
1885                       mbBuf.data(),
1886                       mbLen + 1,        // size in bytes, not length
1887                       NULL,
1888                       NULL
1889                    ) == 0 ||
1890                   strcmp(mbBuf, psz) != 0 )
1891             {
1892                 // we didn't obtain the same thing we started from, hence
1893                 // the conversion was lossy and we consider that it failed
1894                 return (size_t)-1;
1895             }
1896         }
1897
1898         // note that it returns count of written chars for buf != NULL and size
1899         // of the needed buffer for buf == NULL so in either case the length of
1900         // the string (which never includes the terminating NUL) is one less
1901         return len - 1;
1902     }
1903
1904     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1905     {
1906         /*
1907             we have a problem here: by default, WideCharToMultiByte() may
1908             replace characters unrepresentable in the target code page with bad
1909             quality approximations such as turning "1/2" symbol (U+00BD) into
1910             "1" for the code pages which don't have it and we, obviously, want
1911             to avoid this at any price
1912
1913             the trouble is that this function does it _silently_, i.e. it won't
1914             even tell us whether it did or not... Win98/2000 and higher provide
1915             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1916             we have to resort to a round trip, i.e. check that converting back
1917             results in the same string -- this is, of course, expensive but
1918             otherwise we simply can't be sure to not garble the data.
1919          */
1920
1921         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1922         // it doesn't work with CJK encodings (which we test for rather roughly
1923         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1924         // supporting it
1925         BOOL usedDef wxDUMMY_INITIALIZE(false);
1926         BOOL *pUsedDef;
1927         int flags;
1928         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1929         {
1930             // it's our lucky day
1931             flags = WC_NO_BEST_FIT_CHARS;
1932             pUsedDef = &usedDef;
1933         }
1934         else // old system or unsupported encoding
1935         {
1936             flags = 0;
1937             pUsedDef = NULL;
1938         }
1939
1940         const size_t len = ::WideCharToMultiByte
1941                              (
1942                                 m_CodePage,     // code page
1943                                 flags,          // either none or no best fit
1944                                 pwz,            // input string
1945                                 -1,             // it is (wide) NUL-terminated
1946                                 buf,            // output buffer
1947                                 buf ? n : 0,    // and its size
1948                                 NULL,           // default "replacement" char
1949                                 pUsedDef        // [out] was it used?
1950                              );
1951
1952         if ( !len )
1953         {
1954             // function totally failed
1955             return (size_t)-1;
1956         }
1957
1958         // if we were really converting, check if we succeeded
1959         if ( buf )
1960         {
1961             if ( flags )
1962             {
1963                 // check if the conversion failed, i.e. if any replacements
1964                 // were done
1965                 if ( usedDef )
1966                     return (size_t)-1;
1967             }
1968             else // we must resort to double tripping...
1969             {
1970                 wxWCharBuffer wcBuf(n);
1971                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1972                         wcscmp(wcBuf, pwz) != 0 )
1973                 {
1974                     // we didn't obtain the same thing we started from, hence
1975                     // the conversion was lossy and we consider that it failed
1976                     return (size_t)-1;
1977                 }
1978             }
1979         }
1980
1981         // see the comment above for the reason of "len - 1"
1982         return len - 1;
1983     }
1984
1985     virtual size_t GetMBNulLen() const
1986     {
1987         if ( m_minMBCharWidth == 0 )
1988         {
1989             int len = ::WideCharToMultiByte
1990                         (
1991                             m_CodePage,     // code page
1992                             0,              // no flags
1993                             L"",            // input string
1994                             1,              // translate just the NUL
1995                             NULL,           // output buffer
1996                             0,              // and its size
1997                             NULL,           // no replacement char
1998                             NULL            // [out] don't care if it was used
1999                         );
2000
2001             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2002             switch ( len )
2003             {
2004                 default:
2005                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2006                     // fall through
2007
2008                 case 0:
2009                     self->m_minMBCharWidth = (size_t)-1;
2010                     break;
2011
2012                 case 1:
2013                 case 2:
2014                 case 4:
2015                     self->m_minMBCharWidth = len;
2016                     break;
2017             }
2018         }
2019
2020         return m_minMBCharWidth;
2021     }
2022
2023     bool IsOk() const { return m_CodePage != -1; }
2024
2025 private:
2026     static bool CanUseNoBestFit()
2027     {
2028         static int s_isWin98Or2k = -1;
2029
2030         if ( s_isWin98Or2k == -1 )
2031         {
2032             int verMaj, verMin;
2033             switch ( wxGetOsVersion(&verMaj, &verMin) )
2034             {
2035                 case wxWIN95:
2036                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2037                     break;
2038
2039                 case wxWINDOWS_NT:
2040                     s_isWin98Or2k = verMaj >= 5;
2041                     break;
2042
2043                 default:
2044                     // unknown, be conseravtive by default
2045                     s_isWin98Or2k = 0;
2046             }
2047
2048             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2049         }
2050
2051         return s_isWin98Or2k == 1;
2052     }
2053
2054     static bool IsAtLeastWin2kSP4()
2055     {
2056 #ifdef __WXWINCE__
2057         return false;
2058 #else
2059         static int s_isAtLeastWin2kSP4 = -1;
2060
2061         if ( s_isAtLeastWin2kSP4 == -1 )
2062         {
2063             OSVERSIONINFOEX ver;
2064
2065             memset(&ver, 0, sizeof(ver));
2066             ver.dwOSVersionInfoSize = sizeof(ver);
2067             GetVersionEx((OSVERSIONINFO*)&ver);
2068
2069             s_isAtLeastWin2kSP4 =
2070               ((ver.dwMajorVersion > 5) || // Vista+
2071                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2072                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2073                ver.wServicePackMajor >= 4)) // 2000 SP4+
2074               ? 1 : 0;
2075         }
2076
2077         return s_isAtLeastWin2kSP4 == 1;
2078 #endif
2079     }
2080
2081
2082     // the code page we're working with
2083     long m_CodePage;
2084
2085     // cached result of GetMBNulLen(), set to 0 initially meaning
2086     // "unknown"
2087     size_t m_minMBCharWidth;
2088 };
2089
2090 #endif // wxHAVE_WIN32_MB2WC
2091
2092 // ============================================================================
2093 // Cocoa conversion classes
2094 // ============================================================================
2095
2096 #if defined(__WXCOCOA__)
2097
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2101
2102 #include <CoreFoundation/CFString.h>
2103 #include <CoreFoundation/CFStringEncodingExt.h>
2104
2105 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2106 {
2107     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2108     if ( encoding == wxFONTENCODING_DEFAULT )
2109     {
2110         enc = CFStringGetSystemEncoding();
2111     }
2112     else switch( encoding)
2113     {
2114         case wxFONTENCODING_ISO8859_1 :
2115             enc = kCFStringEncodingISOLatin1 ;
2116             break ;
2117         case wxFONTENCODING_ISO8859_2 :
2118             enc = kCFStringEncodingISOLatin2;
2119             break ;
2120         case wxFONTENCODING_ISO8859_3 :
2121             enc = kCFStringEncodingISOLatin3 ;
2122             break ;
2123         case wxFONTENCODING_ISO8859_4 :
2124             enc = kCFStringEncodingISOLatin4;
2125             break ;
2126         case wxFONTENCODING_ISO8859_5 :
2127             enc = kCFStringEncodingISOLatinCyrillic;
2128             break ;
2129         case wxFONTENCODING_ISO8859_6 :
2130             enc = kCFStringEncodingISOLatinArabic;
2131             break ;
2132         case wxFONTENCODING_ISO8859_7 :
2133             enc = kCFStringEncodingISOLatinGreek;
2134             break ;
2135         case wxFONTENCODING_ISO8859_8 :
2136             enc = kCFStringEncodingISOLatinHebrew;
2137             break ;
2138         case wxFONTENCODING_ISO8859_9 :
2139             enc = kCFStringEncodingISOLatin5;
2140             break ;
2141         case wxFONTENCODING_ISO8859_10 :
2142             enc = kCFStringEncodingISOLatin6;
2143             break ;
2144         case wxFONTENCODING_ISO8859_11 :
2145             enc = kCFStringEncodingISOLatinThai;
2146             break ;
2147         case wxFONTENCODING_ISO8859_13 :
2148             enc = kCFStringEncodingISOLatin7;
2149             break ;
2150         case wxFONTENCODING_ISO8859_14 :
2151             enc = kCFStringEncodingISOLatin8;
2152             break ;
2153         case wxFONTENCODING_ISO8859_15 :
2154             enc = kCFStringEncodingISOLatin9;
2155             break ;
2156
2157         case wxFONTENCODING_KOI8 :
2158             enc = kCFStringEncodingKOI8_R;
2159             break ;
2160         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2161             enc = kCFStringEncodingDOSRussian;
2162             break ;
2163
2164 //      case wxFONTENCODING_BULGARIAN :
2165 //          enc = ;
2166 //          break ;
2167
2168         case wxFONTENCODING_CP437 :
2169             enc =kCFStringEncodingDOSLatinUS ;
2170             break ;
2171         case wxFONTENCODING_CP850 :
2172             enc = kCFStringEncodingDOSLatin1;
2173             break ;
2174         case wxFONTENCODING_CP852 :
2175             enc = kCFStringEncodingDOSLatin2;
2176             break ;
2177         case wxFONTENCODING_CP855 :
2178             enc = kCFStringEncodingDOSCyrillic;
2179             break ;
2180         case wxFONTENCODING_CP866 :
2181             enc =kCFStringEncodingDOSRussian ;
2182             break ;
2183         case wxFONTENCODING_CP874 :
2184             enc = kCFStringEncodingDOSThai;
2185             break ;
2186         case wxFONTENCODING_CP932 :
2187             enc = kCFStringEncodingDOSJapanese;
2188             break ;
2189         case wxFONTENCODING_CP936 :
2190             enc =kCFStringEncodingDOSChineseSimplif ;
2191             break ;
2192         case wxFONTENCODING_CP949 :
2193             enc = kCFStringEncodingDOSKorean;
2194             break ;
2195         case wxFONTENCODING_CP950 :
2196             enc = kCFStringEncodingDOSChineseTrad;
2197             break ;
2198         case wxFONTENCODING_CP1250 :
2199             enc = kCFStringEncodingWindowsLatin2;
2200             break ;
2201         case wxFONTENCODING_CP1251 :
2202             enc =kCFStringEncodingWindowsCyrillic ;
2203             break ;
2204         case wxFONTENCODING_CP1252 :
2205             enc =kCFStringEncodingWindowsLatin1 ;
2206             break ;
2207         case wxFONTENCODING_CP1253 :
2208             enc = kCFStringEncodingWindowsGreek;
2209             break ;
2210         case wxFONTENCODING_CP1254 :
2211             enc = kCFStringEncodingWindowsLatin5;
2212             break ;
2213         case wxFONTENCODING_CP1255 :
2214             enc =kCFStringEncodingWindowsHebrew ;
2215             break ;
2216         case wxFONTENCODING_CP1256 :
2217             enc =kCFStringEncodingWindowsArabic ;
2218             break ;
2219         case wxFONTENCODING_CP1257 :
2220             enc = kCFStringEncodingWindowsBalticRim;
2221             break ;
2222 //   This only really encodes to UTF7 (if that) evidently
2223 //        case wxFONTENCODING_UTF7 :
2224 //            enc = kCFStringEncodingNonLossyASCII ;
2225 //            break ;
2226         case wxFONTENCODING_UTF8 :
2227             enc = kCFStringEncodingUTF8 ;
2228             break ;
2229         case wxFONTENCODING_EUC_JP :
2230             enc = kCFStringEncodingEUC_JP;
2231             break ;
2232         case wxFONTENCODING_UTF16 :
2233             enc = kCFStringEncodingUnicode ;
2234             break ;
2235         case wxFONTENCODING_MACROMAN :
2236             enc = kCFStringEncodingMacRoman ;
2237             break ;
2238         case wxFONTENCODING_MACJAPANESE :
2239             enc = kCFStringEncodingMacJapanese ;
2240             break ;
2241         case wxFONTENCODING_MACCHINESETRAD :
2242             enc = kCFStringEncodingMacChineseTrad ;
2243             break ;
2244         case wxFONTENCODING_MACKOREAN :
2245             enc = kCFStringEncodingMacKorean ;
2246             break ;
2247         case wxFONTENCODING_MACARABIC :
2248             enc = kCFStringEncodingMacArabic ;
2249             break ;
2250         case wxFONTENCODING_MACHEBREW :
2251             enc = kCFStringEncodingMacHebrew ;
2252             break ;
2253         case wxFONTENCODING_MACGREEK :
2254             enc = kCFStringEncodingMacGreek ;
2255             break ;
2256         case wxFONTENCODING_MACCYRILLIC :
2257             enc = kCFStringEncodingMacCyrillic ;
2258             break ;
2259         case wxFONTENCODING_MACDEVANAGARI :
2260             enc = kCFStringEncodingMacDevanagari ;
2261             break ;
2262         case wxFONTENCODING_MACGURMUKHI :
2263             enc = kCFStringEncodingMacGurmukhi ;
2264             break ;
2265         case wxFONTENCODING_MACGUJARATI :
2266             enc = kCFStringEncodingMacGujarati ;
2267             break ;
2268         case wxFONTENCODING_MACORIYA :
2269             enc = kCFStringEncodingMacOriya ;
2270             break ;
2271         case wxFONTENCODING_MACBENGALI :
2272             enc = kCFStringEncodingMacBengali ;
2273             break ;
2274         case wxFONTENCODING_MACTAMIL :
2275             enc = kCFStringEncodingMacTamil ;
2276             break ;
2277         case wxFONTENCODING_MACTELUGU :
2278             enc = kCFStringEncodingMacTelugu ;
2279             break ;
2280         case wxFONTENCODING_MACKANNADA :
2281             enc = kCFStringEncodingMacKannada ;
2282             break ;
2283         case wxFONTENCODING_MACMALAJALAM :
2284             enc = kCFStringEncodingMacMalayalam ;
2285             break ;
2286         case wxFONTENCODING_MACSINHALESE :
2287             enc = kCFStringEncodingMacSinhalese ;
2288             break ;
2289         case wxFONTENCODING_MACBURMESE :
2290             enc = kCFStringEncodingMacBurmese ;
2291             break ;
2292         case wxFONTENCODING_MACKHMER :
2293             enc = kCFStringEncodingMacKhmer ;
2294             break ;
2295         case wxFONTENCODING_MACTHAI :
2296             enc = kCFStringEncodingMacThai ;
2297             break ;
2298         case wxFONTENCODING_MACLAOTIAN :
2299             enc = kCFStringEncodingMacLaotian ;
2300             break ;
2301         case wxFONTENCODING_MACGEORGIAN :
2302             enc = kCFStringEncodingMacGeorgian ;
2303             break ;
2304         case wxFONTENCODING_MACARMENIAN :
2305             enc = kCFStringEncodingMacArmenian ;
2306             break ;
2307         case wxFONTENCODING_MACCHINESESIMP :
2308             enc = kCFStringEncodingMacChineseSimp ;
2309             break ;
2310         case wxFONTENCODING_MACTIBETAN :
2311             enc = kCFStringEncodingMacTibetan ;
2312             break ;
2313         case wxFONTENCODING_MACMONGOLIAN :
2314             enc = kCFStringEncodingMacMongolian ;
2315             break ;
2316         case wxFONTENCODING_MACETHIOPIC :
2317             enc = kCFStringEncodingMacEthiopic ;
2318             break ;
2319         case wxFONTENCODING_MACCENTRALEUR :
2320             enc = kCFStringEncodingMacCentralEurRoman ;
2321             break ;
2322         case wxFONTENCODING_MACVIATNAMESE :
2323             enc = kCFStringEncodingMacVietnamese ;
2324             break ;
2325         case wxFONTENCODING_MACARABICEXT :
2326             enc = kCFStringEncodingMacExtArabic ;
2327             break ;
2328         case wxFONTENCODING_MACSYMBOL :
2329             enc = kCFStringEncodingMacSymbol ;
2330             break ;
2331         case wxFONTENCODING_MACDINGBATS :
2332             enc = kCFStringEncodingMacDingbats ;
2333             break ;
2334         case wxFONTENCODING_MACTURKISH :
2335             enc = kCFStringEncodingMacTurkish ;
2336             break ;
2337         case wxFONTENCODING_MACCROATIAN :
2338             enc = kCFStringEncodingMacCroatian ;
2339             break ;
2340         case wxFONTENCODING_MACICELANDIC :
2341             enc = kCFStringEncodingMacIcelandic ;
2342             break ;
2343         case wxFONTENCODING_MACROMANIAN :
2344             enc = kCFStringEncodingMacRomanian ;
2345             break ;
2346         case wxFONTENCODING_MACCELTIC :
2347             enc = kCFStringEncodingMacCeltic ;
2348             break ;
2349         case wxFONTENCODING_MACGAELIC :
2350             enc = kCFStringEncodingMacGaelic ;
2351             break ;
2352 //      case wxFONTENCODING_MACKEYBOARD :
2353 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2354 //          break ;
2355         default :
2356             // because gcc is picky
2357             break ;
2358     } ;
2359     return enc ;
2360 }
2361
2362 class wxMBConv_cocoa : public wxMBConv
2363 {
2364 public:
2365     wxMBConv_cocoa()
2366     {
2367         Init(CFStringGetSystemEncoding()) ;
2368     }
2369
2370 #if wxUSE_FONTMAP
2371     wxMBConv_cocoa(const wxChar* name)
2372     {
2373         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2374     }
2375 #endif
2376
2377     wxMBConv_cocoa(wxFontEncoding encoding)
2378     {
2379         Init( wxCFStringEncFromFontEnc(encoding) );
2380     }
2381
2382     ~wxMBConv_cocoa()
2383     {
2384     }
2385
2386     void Init( CFStringEncoding encoding)
2387     {
2388         m_encoding = encoding ;
2389     }
2390
2391     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2392     {
2393         wxASSERT(szUnConv);
2394
2395         CFStringRef theString = CFStringCreateWithBytes (
2396                                                 NULL, //the allocator
2397                                                 (const UInt8*)szUnConv,
2398                                                 strlen(szUnConv),
2399                                                 m_encoding,
2400                                                 false //no BOM/external representation
2401                                                 );
2402
2403         wxASSERT(theString);
2404
2405         size_t nOutLength = CFStringGetLength(theString);
2406
2407         if (szOut == NULL)
2408         {
2409             CFRelease(theString);
2410             return nOutLength;
2411         }
2412
2413         CFRange theRange = { 0, nOutSize };
2414
2415 #if SIZEOF_WCHAR_T == 4
2416         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2417 #endif
2418
2419         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2420
2421         CFRelease(theString);
2422
2423         szUniCharBuffer[nOutLength] = '\0' ;
2424
2425 #if SIZEOF_WCHAR_T == 4
2426         wxMBConvUTF16 converter ;
2427         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2428         delete[] szUniCharBuffer;
2429 #endif
2430
2431         return nOutLength;
2432     }
2433
2434     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2435     {
2436         wxASSERT(szUnConv);
2437
2438         size_t nRealOutSize;
2439         size_t nBufSize = wxWcslen(szUnConv);
2440         UniChar* szUniBuffer = (UniChar*) szUnConv;
2441
2442 #if SIZEOF_WCHAR_T == 4
2443         wxMBConvUTF16 converter ;
2444         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2445         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2446         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2447         nBufSize /= sizeof(UniChar);
2448 #endif
2449
2450         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2451                                 NULL, //allocator
2452                                 szUniBuffer,
2453                                 nBufSize,
2454                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2455                             );
2456
2457         wxASSERT(theString);
2458
2459         //Note that CER puts a BOM when converting to unicode
2460         //so we  check and use getchars instead in that case
2461         if (m_encoding == kCFStringEncodingUnicode)
2462         {
2463             if (szOut != NULL)
2464                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2465
2466             nRealOutSize = CFStringGetLength(theString) + 1;
2467         }
2468         else
2469         {
2470             CFStringGetBytes(
2471                 theString,
2472                 CFRangeMake(0, CFStringGetLength(theString)),
2473                 m_encoding,
2474                 0, //what to put in characters that can't be converted -
2475                     //0 tells CFString to return NULL if it meets such a character
2476                 false, //not an external representation
2477                 (UInt8*) szOut,
2478                 nOutSize,
2479                 (CFIndex*) &nRealOutSize
2480                         );
2481         }
2482
2483         CFRelease(theString);
2484
2485 #if SIZEOF_WCHAR_T == 4
2486         delete[] szUniBuffer;
2487 #endif
2488
2489         return  nRealOutSize - 1;
2490     }
2491
2492     bool IsOk() const
2493     {
2494         return m_encoding != kCFStringEncodingInvalidId &&
2495               CFStringIsEncodingAvailable(m_encoding);
2496     }
2497
2498 private:
2499     CFStringEncoding m_encoding ;
2500 };
2501
2502 #endif // defined(__WXCOCOA__)
2503
2504 // ============================================================================
2505 // Mac conversion classes
2506 // ============================================================================
2507
2508 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2509
2510 class wxMBConv_mac : public wxMBConv
2511 {
2512 public:
2513     wxMBConv_mac()
2514     {
2515         Init(CFStringGetSystemEncoding()) ;
2516     }
2517
2518 #if wxUSE_FONTMAP
2519     wxMBConv_mac(const wxChar* name)
2520     {
2521         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2522     }
2523 #endif
2524
2525     wxMBConv_mac(wxFontEncoding encoding)
2526     {
2527         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2528     }
2529
2530     ~wxMBConv_mac()
2531     {
2532         OSStatus status = noErr ;
2533         status = TECDisposeConverter(m_MB2WC_converter);
2534         status = TECDisposeConverter(m_WC2MB_converter);
2535     }
2536
2537
2538     void Init( TextEncodingBase encoding)
2539     {
2540         OSStatus status = noErr ;
2541         m_char_encoding = encoding ;
2542         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2543
2544         status = TECCreateConverter(&m_MB2WC_converter,
2545                                     m_char_encoding,
2546                                     m_unicode_encoding);
2547         status = TECCreateConverter(&m_WC2MB_converter,
2548                                     m_unicode_encoding,
2549                                     m_char_encoding);
2550     }
2551
2552     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2553     {
2554         OSStatus status = noErr ;
2555         ByteCount byteOutLen ;
2556         ByteCount byteInLen = strlen(psz) ;
2557         wchar_t *tbuf = NULL ;
2558         UniChar* ubuf = NULL ;
2559         size_t res = 0 ;
2560
2561         if (buf == NULL)
2562         {
2563             //apple specs say at least 32
2564             n = wxMax( 32 , byteInLen ) ;
2565             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2566         }
2567         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2568 #if SIZEOF_WCHAR_T == 4
2569         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2570 #else
2571         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2572 #endif
2573         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2574           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2575 #if SIZEOF_WCHAR_T == 4
2576         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2577         // is not properly terminated we get random characters at the end
2578         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2579         wxMBConvUTF16 converter ;
2580         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2581         free( ubuf ) ;
2582 #else
2583         res = byteOutLen / sizeof( UniChar ) ;
2584 #endif
2585         if ( buf == NULL )
2586              free(tbuf) ;
2587
2588         if ( buf  && res < n)
2589             buf[res] = 0;
2590
2591         return res ;
2592     }
2593
2594     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2595     {
2596         OSStatus status = noErr ;
2597         ByteCount byteOutLen ;
2598         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2599
2600         char *tbuf = NULL ;
2601
2602         if (buf == NULL)
2603         {
2604             //apple specs say at least 32
2605             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2606             tbuf = (char*) malloc( n ) ;
2607         }
2608
2609         ByteCount byteBufferLen = n ;
2610         UniChar* ubuf = NULL ;
2611 #if SIZEOF_WCHAR_T == 4
2612         wxMBConvUTF16 converter ;
2613         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2614         byteInLen = unicharlen ;
2615         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2616         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2617 #else
2618         ubuf = (UniChar*) psz ;
2619 #endif
2620         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2621             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2622 #if SIZEOF_WCHAR_T == 4
2623         free( ubuf ) ;
2624 #endif
2625         if ( buf == NULL )
2626             free(tbuf) ;
2627
2628         size_t res = byteOutLen ;
2629         if ( buf  && res < n)
2630         {
2631             buf[res] = 0;
2632
2633             //we need to double-trip to verify it didn't insert any ? in place
2634             //of bogus characters
2635             wxWCharBuffer wcBuf(n);
2636             size_t pszlen = wxWcslen(psz);
2637             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2638                         wxWcslen(wcBuf) != pszlen ||
2639                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2640             {
2641                 // we didn't obtain the same thing we started from, hence
2642                 // the conversion was lossy and we consider that it failed
2643                 return (size_t)-1;
2644             }
2645         }
2646
2647         return res ;
2648     }
2649
2650     bool IsOk() const
2651         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2652
2653 private:
2654     TECObjectRef m_MB2WC_converter ;
2655     TECObjectRef m_WC2MB_converter ;
2656
2657     TextEncodingBase m_char_encoding ;
2658     TextEncodingBase m_unicode_encoding ;
2659 };
2660
2661 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2662
2663 // ============================================================================
2664 // wxEncodingConverter based conversion classes
2665 // ============================================================================
2666
2667 #if wxUSE_FONTMAP
2668
2669 class wxMBConv_wxwin : public wxMBConv
2670 {
2671 private:
2672     void Init()
2673     {
2674         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2675                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2676     }
2677
2678 public:
2679     // temporarily just use wxEncodingConverter stuff,
2680     // so that it works while a better implementation is built
2681     wxMBConv_wxwin(const wxChar* name)
2682     {
2683         if (name)
2684             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2685         else
2686             m_enc = wxFONTENCODING_SYSTEM;
2687
2688         Init();
2689     }
2690
2691     wxMBConv_wxwin(wxFontEncoding enc)
2692     {
2693         m_enc = enc;
2694
2695         Init();
2696     }
2697
2698     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2699     {
2700         size_t inbuf = strlen(psz);
2701         if (buf)
2702         {
2703             if (!m2w.Convert(psz,buf))
2704                 return (size_t)-1;
2705         }
2706         return inbuf;
2707     }
2708
2709     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2710     {
2711         const size_t inbuf = wxWcslen(psz);
2712         if (buf)
2713         {
2714             if (!w2m.Convert(psz,buf))
2715                 return (size_t)-1;
2716         }
2717
2718         return inbuf;
2719     }
2720
2721     virtual size_t GetMBNulLen() const
2722     {
2723         switch ( m_enc )
2724         {
2725             case wxFONTENCODING_UTF16BE:
2726             case wxFONTENCODING_UTF16LE:
2727                 return 2;
2728
2729             case wxFONTENCODING_UTF32BE:
2730             case wxFONTENCODING_UTF32LE:
2731                 return 4;
2732
2733             default:
2734                 return 1;
2735         }
2736     }
2737
2738     bool IsOk() const { return m_ok; }
2739
2740 public:
2741     wxFontEncoding m_enc;
2742     wxEncodingConverter m2w, w2m;
2743
2744 private:
2745     // were we initialized successfully?
2746     bool m_ok;
2747
2748     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2749 };
2750
2751 // make the constructors available for unit testing
2752 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2753 {
2754     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2755     if ( !result->IsOk() )
2756     {
2757         delete result;
2758         return 0;
2759     }
2760     return result;
2761 }
2762
2763 #endif // wxUSE_FONTMAP
2764
2765 // ============================================================================
2766 // wxCSConv implementation
2767 // ============================================================================
2768
2769 void wxCSConv::Init()
2770 {
2771     m_name = NULL;
2772     m_convReal =  NULL;
2773     m_deferred = true;
2774 }
2775
2776 wxCSConv::wxCSConv(const wxChar *charset)
2777 {
2778     Init();
2779
2780     if ( charset )
2781     {
2782         SetName(charset);
2783     }
2784
2785 #if wxUSE_FONTMAP
2786     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2787 #else
2788     m_encoding = wxFONTENCODING_SYSTEM;
2789 #endif
2790 }
2791
2792 wxCSConv::wxCSConv(wxFontEncoding encoding)
2793 {
2794     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2795     {
2796         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2797
2798         encoding = wxFONTENCODING_SYSTEM;
2799     }
2800
2801     Init();
2802
2803     m_encoding = encoding;
2804 }
2805
2806 wxCSConv::~wxCSConv()
2807 {
2808     Clear();
2809 }
2810
2811 wxCSConv::wxCSConv(const wxCSConv& conv)
2812         : wxMBConv()
2813 {
2814     Init();
2815
2816     SetName(conv.m_name);
2817     m_encoding = conv.m_encoding;
2818 }
2819
2820 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2821 {
2822     Clear();
2823
2824     SetName(conv.m_name);
2825     m_encoding = conv.m_encoding;
2826
2827     return *this;
2828 }
2829
2830 void wxCSConv::Clear()
2831 {
2832     free(m_name);
2833     delete m_convReal;
2834
2835     m_name = NULL;
2836     m_convReal = NULL;
2837 }
2838
2839 void wxCSConv::SetName(const wxChar *charset)
2840 {
2841     if (charset)
2842     {
2843         m_name = wxStrdup(charset);
2844         m_deferred = true;
2845     }
2846 }
2847
2848 #if wxUSE_FONTMAP
2849 #include "wx/hashmap.h"
2850
// Hash map type keyed by wxFontEncoding with wxString values.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate(): for each encoding already tried it
// holds the charset name for which a working iconv conversion was created,
// or an empty string if none of the encoding's names worked.
static wxEncodingNameCache gs_nameCache;
2855 #endif
2856
2857 wxMBConv *wxCSConv::DoCreate() const
2858 {
2859 #if wxUSE_FONTMAP
2860     wxLogTrace(TRACE_STRCONV,
2861                wxT("creating conversion for %s"),
2862                (m_name ? m_name
2863                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2864 #endif // wxUSE_FONTMAP
2865
2866     // check for the special case of ASCII or ISO8859-1 charset: as we have
2867     // special knowledge of it anyhow, we don't need to create a special
2868     // conversion object
2869     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2870             m_encoding == wxFONTENCODING_DEFAULT )
2871     {
2872         // don't convert at all
2873         return NULL;
2874     }
2875
2876     // we trust OS to do conversion better than we can so try external
2877     // conversion methods first
2878     //
2879     // the full order is:
2880     //      1. OS conversion (iconv() under Unix or Win32 API)
2881     //      2. hard coded conversions for UTF
2882     //      3. wxEncodingConverter as fall back
2883
2884     // step (1)
2885 #ifdef HAVE_ICONV
2886 #if !wxUSE_FONTMAP
2887     if ( m_name )
2888 #endif // !wxUSE_FONTMAP
2889     {
2890         wxString name(m_name);
2891         wxFontEncoding encoding(m_encoding);
2892
2893         if ( !name.empty() )
2894         {
2895             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2896             if ( conv->IsOk() )
2897                 return conv;
2898
2899             delete conv;
2900
2901 #if wxUSE_FONTMAP
2902             encoding =
2903                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2904 #endif // wxUSE_FONTMAP
2905         }
2906 #if wxUSE_FONTMAP
2907         {
2908             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2909             if ( it != gs_nameCache.end() )
2910             {
2911                 if ( it->second.empty() )
2912                     return NULL;
2913
2914                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2915                 if ( conv->IsOk() )
2916                     return conv;
2917
2918                 delete conv;
2919             }
2920
2921             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2922
2923             for ( ; *names; ++names )
2924             {
2925                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2926                 if ( conv->IsOk() )
2927                 {
2928                     gs_nameCache[encoding] = *names;
2929                     return conv;
2930                 }
2931
2932                 delete conv;
2933             }
2934
2935             gs_nameCache[encoding] = _T(""); // cache the failure
2936         }
2937 #endif // wxUSE_FONTMAP
2938     }
2939 #endif // HAVE_ICONV
2940
2941 #ifdef wxHAVE_WIN32_MB2WC
2942     {
2943 #if wxUSE_FONTMAP
2944         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2945                                       : new wxMBConv_win32(m_encoding);
2946         if ( conv->IsOk() )
2947             return conv;
2948
2949         delete conv;
2950 #else
2951         return NULL;
2952 #endif
2953     }
2954 #endif // wxHAVE_WIN32_MB2WC
2955 #if defined(__WXMAC__)
2956     {
2957         // leave UTF16 and UTF32 to the built-ins of wx
2958         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2959             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2960         {
2961
2962 #if wxUSE_FONTMAP
2963             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2964                                         : new wxMBConv_mac(m_encoding);
2965 #else
2966             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2967 #endif
2968             if ( conv->IsOk() )
2969                  return conv;
2970
2971             delete conv;
2972         }
2973     }
2974 #endif
2975 #if defined(__WXCOCOA__)
2976     {
2977         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2978         {
2979
2980 #if wxUSE_FONTMAP
2981             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2982                                           : new wxMBConv_cocoa(m_encoding);
2983 #else
2984             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2985 #endif
2986             if ( conv->IsOk() )
2987                  return conv;
2988
2989             delete conv;
2990         }
2991     }
2992 #endif
2993     // step (2)
2994     wxFontEncoding enc = m_encoding;
2995 #if wxUSE_FONTMAP
2996     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2997     {
2998         // use "false" to suppress interactive dialogs -- we can be called from
2999         // anywhere and popping up a dialog from here is the last thing we want to
3000         // do
3001         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3002     }
3003 #endif // wxUSE_FONTMAP
3004
3005     switch ( enc )
3006     {
3007         case wxFONTENCODING_UTF7:
3008              return new wxMBConvUTF7;
3009
3010         case wxFONTENCODING_UTF8:
3011              return new wxMBConvUTF8;
3012
3013         case wxFONTENCODING_UTF16BE:
3014              return new wxMBConvUTF16BE;
3015
3016         case wxFONTENCODING_UTF16LE:
3017              return new wxMBConvUTF16LE;
3018
3019         case wxFONTENCODING_UTF32BE:
3020              return new wxMBConvUTF32BE;
3021
3022         case wxFONTENCODING_UTF32LE:
3023              return new wxMBConvUTF32LE;
3024
3025         default:
3026              // nothing to do but put here to suppress gcc warnings
3027              ;
3028     }
3029
3030     // step (3)
3031 #if wxUSE_FONTMAP
3032     {
3033         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3034                                       : new wxMBConv_wxwin(m_encoding);
3035         if ( conv->IsOk() )
3036             return conv;
3037
3038         delete conv;
3039     }
3040 #endif // wxUSE_FONTMAP
3041
3042     // NB: This is a hack to prevent deadlock. What could otherwise happen
3043     //     in Unicode build: wxConvLocal creation ends up being here
3044     //     because of some failure and logs the error. But wxLog will try to
3045     //     attach timestamp, for which it will need wxConvLocal (to convert
3046     //     time to char* and then wchar_t*), but that fails, tries to log
3047     //     error, but wxLog has a (already locked) critical section that
3048     //     guards static buffer.
3049     static bool alreadyLoggingError = false;
3050     if (!alreadyLoggingError)
3051     {
3052         alreadyLoggingError = true;
3053         wxLogError(_("Cannot convert from the charset '%s'!"),
3054                    m_name ? m_name
3055                       :
3056 #if wxUSE_FONTMAP
3057                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3058 #else // !wxUSE_FONTMAP
3059                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3060 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3061               );
3062         alreadyLoggingError = false;
3063     }
3064
3065     return NULL;
3066 }
3067
3068 void wxCSConv::CreateConvIfNeeded() const
3069 {
3070     if ( m_deferred )
3071     {
3072         wxCSConv *self = (wxCSConv *)this; // const_cast
3073
3074 #if wxUSE_INTL
3075         // if we don't have neither the name nor the encoding, use the default
3076         // encoding for this system
3077         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3078         {
3079             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3080         }
3081 #endif // wxUSE_INTL
3082
3083         self->m_convReal = DoCreate();
3084         self->m_deferred = false;
3085     }
3086 }
3087
3088 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3089 {
3090     CreateConvIfNeeded();
3091
3092     if (m_convReal)
3093         return m_convReal->MB2WC(buf, psz, n);
3094
3095     // latin-1 (direct)
3096     size_t len = strlen(psz);
3097
3098     if (buf)
3099     {
3100         for (size_t c = 0; c <= len; c++)
3101             buf[c] = (unsigned char)(psz[c]);
3102     }
3103
3104     return len;
3105 }
3106
3107 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3108 {
3109     CreateConvIfNeeded();
3110
3111     if (m_convReal)
3112         return m_convReal->WC2MB(buf, psz, n);
3113
3114     // latin-1 (direct)
3115     const size_t len = wxWcslen(psz);
3116     if (buf)
3117     {
3118         for (size_t c = 0; c <= len; c++)
3119         {
3120             if (psz[c] > 0xFF)
3121                 return (size_t)-1;
3122             buf[c] = (char)psz[c];
3123         }
3124     }
3125     else
3126     {
3127         for (size_t c = 0; c <= len; c++)
3128         {
3129             if (psz[c] > 0xFF)
3130                 return (size_t)-1;
3131         }
3132     }
3133
3134     return len;
3135 }
3136
3137 size_t wxCSConv::GetMBNulLen() const
3138 {
3139     CreateConvIfNeeded();
3140
3141     if ( m_convReal )
3142     {
3143         return m_convReal->GetMBNulLen();
3144     }
3145
3146     return 1;
3147 }
3148
3149 // ----------------------------------------------------------------------------
3150 // globals
3151 // ----------------------------------------------------------------------------
3152
3153 #ifdef __WINDOWS__
3154     static wxMBConv_win32 wxConvLibcObj;
3155 #elif defined(__WXMAC__) && !defined(__MACH__)
3156     static wxMBConv_mac wxConvLibcObj ;
3157 #else
3158     static wxMBConvLibc wxConvLibcObj;
3159 #endif
3160
3161 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3162 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3163 static wxMBConvUTF7 wxConvUTF7Obj;
3164 static wxMBConvUTF8 wxConvUTF8Obj;
3165
3166 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3167 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3168 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3169 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3171 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3173 #ifdef __WXOSX__
3174                                     wxConvUTF8Obj;
3175 #else
3176                                     wxConvLibcObj;
3177 #endif
3178
3179
3180 #else // !wxUSE_WCHAR_T
3181
3182 // stand-ins in absence of wchar_t
3183 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3184                                 wxConvISO8859_1,
3185                                 wxConvLocal,
3186                                 wxConvUTF8;
3187
3188 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T