src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded
 207             break;
 208         }
 209
 210         if ( lenChunk == wxCONV_FAILED )
 211             return wxCONV_FAILED;
 212
 213         // if we already have a previous chunk, leave the NUL separating it
 214         // from this one
 215         if ( dstWritten )
 216         {
 217             dstWritten++;
 218             if ( dst )
 219                 dst++;
 220         }
 221
 222         dstWritten += lenChunk;
 223
 224         if ( dst )
 225         {
 226             if ( dstWritten > dstLen )
 227                 return wxCONV_FAILED;
 228
 229             lenChunk = MB2WC(dst, src, lenChunk + 1 /* for NUL */);
 230             if ( lenChunk == wxCONV_FAILED )
 231                 return wxCONV_FAILED;
 232
 233             dst += lenChunk;
 234         }
 235
 236         if ( !srcEnd )
 237         {
 238             // we convert the entire string in this cas, as we suppose that the
 239             // string is NUL-terminated and so srcEnd is not used at all
 240             break;
 241         }
 242
 243         // advance the input pointer past the end of this chunk
 244         while ( NotAllNULs(src, nulLen) )
 245         {
 246             // notice that we must skip over multiple bytes here as we suppose
 247             // that if NUL takes 2 or 4 bytes, then all the other characters do
 248             // too and so if advanced by a single byte we might erroneously
 249             // detect sequences of NUL bytes in the middle of the input
 250             src += nulLen;
 251         }
 252
 253         src += nulLen; // skipping over its terminator as well
 254
 255         // note that ">=" (and not just "==") is needed here as the terminator
 256         // we skipped just above could be inside or just after the buffer
 257         // delimited by inEnd
 258         if ( src >= srcEnd )
 259             break;
 260     }
 261
 262     return dstWritten;
 263 }
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == (size_t)-1 )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 wxMBConv::~wxMBConv()
 320 {
 321     // nothing to do here (necessary for Darwin linking probably)
 322 }
 323
 324 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 325 {
 326     if ( psz )
 327     {
 328         // calculate the length of the buffer needed first
 329         const size_t nLen = MB2WC(NULL, psz, 0);
 330         if ( nLen != wxCONV_FAILED )
 331         {
 332             // now do the actual conversion
 333             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 334
 335             // +1 for the trailing NULL
 336             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 337                 return buf;
 338         }
 339     }
 340
 341     return wxWCharBuffer();
 342 }
 343
 344 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 345 {
 346     if ( pwz )
 347     {
 348         const size_t nLen = WC2MB(NULL, pwz, 0);
 349         if ( nLen != wxCONV_FAILED )
 350         {
 351             // extra space for trailing NUL(s)
 352             static const size_t extraLen = GetMaxMBNulLen();
 353
 354             wxCharBuffer buf(nLen + extraLen - 1);
 355             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 356                 return buf;
 357         }
 358     }
 359
 360     return wxCharBuffer();
 361 }
 362
 363 const wxWCharBuffer
 364 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 365 {
 366     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 367     if ( dstLen != wxCONV_FAILED )
 368     {
 369         wxWCharBuffer wbuf(dstLen);
 370         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 371         {
 372             if ( outLen )
 373                 *outLen = dstLen;
 374             return wbuf;
 375         }
 376     }
 377
 378     if ( outLen )
 379         *outLen = 0;
 380
 381     return wxWCharBuffer();
 382 }
 383
 384 const wxCharBuffer
 385 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         wxCharBuffer buf(dstLen);
 391         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 392         {
 393             if ( outLen )
 394                 *outLen = dstLen;
 395             return buf;
 396         }
 397     }
 398
 399     if ( outLen )
 400         *outLen = 0;
 401
 402     return wxCharBuffer();
 403 }
 404
 405 // ----------------------------------------------------------------------------
 406 // wxMBConvLibc
 407 // ----------------------------------------------------------------------------
 408
 409 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 410 {
 411     return wxMB2WC(buf, psz, n);
 412 }
 413
 414 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 415 {
 416     return wxWC2MB(buf, psz, n);
 417 }
 418
 419 // ----------------------------------------------------------------------------
 420 // wxConvBrokenFileNames
 421 // ----------------------------------------------------------------------------
 422
 423 #ifdef __UNIX__
 424
 425 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 426 {
 427     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 428                   || wxStricmp(charset, _T("UTF8")) == 0  )
 429         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 430     else
 431         m_conv = new wxCSConv(charset);
 432 }
 433
 434 #endif // __UNIX__
 435
 436 // ----------------------------------------------------------------------------
 437 // UTF-7
 438 // ----------------------------------------------------------------------------
 439
 440 // Implementation (C) 2004 Fredrik Roubert
 441
 442 //
 443 // BASE64 decoding table
 444 //
 445 static const unsigned char utf7unb64[] =
 446 {
 447     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 448     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 449     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 450     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 451     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 452     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 453     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 454     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 455     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 456     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 457     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 458     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 459     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 460     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 461     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 462     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 463     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 464     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 465     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 471     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 472     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 473     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 474     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 475     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 476     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 477     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 478     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 479 };
 480
 481 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 482 {
 483     size_t len = 0;
 484
 485     while ( *psz && (!buf || (len < n)) )
 486     {
 487         unsigned char cc = *psz++;
 488         if (cc != '+')
 489         {
 490             // plain ASCII char
 491             if (buf)
 492                 *buf++ = cc;
 493             len++;
 494         }
 495         else if (*psz == '-')
 496         {
 497             // encoded plus sign
 498             if (buf)
 499                 *buf++ = cc;
 500             len++;
 501             psz++;
 502         }
 503         else // start of BASE64 encoded string
 504         {
 505             bool lsb, ok;
 506             unsigned int d, l;
 507             for ( ok = lsb = false, d = 0, l = 0;
 508                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 509                   psz++ )
 510             {
 511                 d <<= 6;
 512                 d += cc;
 513                 for (l += 6; l >= 8; lsb = !lsb)
 514                 {
 515                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 516                     if (lsb)
 517                     {
 518                         if (buf)
 519                             *buf++ |= c;
 520                         len ++;
 521                     }
 522                     else
 523                     {
 524                         if (buf)
 525                             *buf = (wchar_t)(c << 8);
 526                     }
 527
 528                     ok = true;
 529                 }
 530             }
 531
 532             if ( !ok )
 533             {
 534                 // in valid UTF7 we should have valid characters after '+'
 535                 return (size_t)-1;
 536             }
 537
 538             if (*psz == '-')
 539                 psz++;
 540         }
 541     }
 542
 543     if ( buf && (len < n) )
 544         *buf = '\0';
 545
 546     return len;
 547 }
 548
 549 //
 550 // BASE64 encoding table
 551 //
 552 static const unsigned char utf7enb64[] =
 553 {
 554     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 555     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 556     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 557     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 558     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 559     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 560     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 561     '4', '5', '6', '7', '8', '9', '+', '/'
 562 };
 563
 564 //
 565 // UTF-7 encoding table
 566 //
 567 // 0 - Set D (directly encoded characters)
 568 // 1 - Set O (optional direct characters)
 569 // 2 - whitespace characters (optional)
 570 // 3 - special characters
 571 //
 572 static const unsigned char utf7encode[128] =
 573 {
 574     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 575     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 576     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 577     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 578     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 579     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 580     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 581     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 582 };
 583
 584 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 585 {
 586     size_t len = 0;
 587
 588     while (*psz && ((!buf) || (len < n)))
 589     {
 590         wchar_t cc = *psz++;
 591         if (cc < 0x80 && utf7encode[cc] < 1)
 592         {
 593             // plain ASCII char
 594             if (buf)
 595                 *buf++ = (char)cc;
 596             len++;
 597         }
 598 #ifndef WC_UTF16
 599         else if (((wxUint32)cc) > 0xffff)
 600         {
 601             // no surrogate pair generation (yet?)
 602             return (size_t)-1;
 603         }
 604 #endif
 605         else
 606         {
 607             if (buf)
 608                 *buf++ = '+';
 609             len++;
 610             if (cc != '+')
 611             {
 612                 // BASE64 encode string
 613                 unsigned int lsb, d, l;
 614                 for (d = 0, l = 0; /*nothing*/; psz++)
 615                 {
 616                     for (lsb = 0; lsb < 2; lsb ++)
 617                     {
 618                         d <<= 8;
 619                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 620
 621                         for (l += 8; l >= 6; )
 622                         {
 623                             l -= 6;
 624                             if (buf)
 625                                 *buf++ = utf7enb64[(d >> l) % 64];
 626                             len++;
 627                         }
 628                     }
 629                     cc = *psz;
 630                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 631                         break;
 632                 }
 633                 if (l != 0)
 634                 {
 635                     if (buf)
 636                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 637                     len++;
 638                 }
 639             }
 640             if (buf)
 641                 *buf++ = '-';
 642             len++;
 643         }
 644     }
 645     if (buf && (len < n))
 646         *buf = 0;
 647     return len;
 648 }
 649
 650 // ----------------------------------------------------------------------------
 651 // UTF-8
 652 // ----------------------------------------------------------------------------
 653
 654 static wxUint32 utf8_max[]=
 655     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 656
 657 // boundaries of the private use area we use to (temporarily) remap invalid
 658 // characters invalid in a UTF-8 encoded string
 659 const wxUint32 wxUnicodePUA = 0x100000;
 660 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 661
 662 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 663 {
 664     size_t len = 0;
 665
 666     while (*psz && ((!buf) || (len < n)))
 667     {
 668         const char *opsz = psz;
 669         bool invalid = false;
 670         unsigned char cc = *psz++, fc = cc;
 671         unsigned cnt;
 672         for (cnt = 0; fc & 0x80; cnt++)
 673             fc <<= 1;
 674         if (!cnt)
 675         {
 676             // plain ASCII char
 677             if (buf)
 678                 *buf++ = cc;
 679             len++;
 680
 681             // escape the escape character for octal escapes
 682             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 683                     && cc == '\\' && (!buf || len < n))
 684             {
 685                 if (buf)
 686                     *buf++ = cc;
 687                 len++;
 688             }
 689         }
 690         else
 691         {
 692             cnt--;
 693             if (!cnt)
 694             {
 695                 // invalid UTF-8 sequence
 696                 invalid = true;
 697             }
 698             else
 699             {
 700                 unsigned ocnt = cnt - 1;
 701                 wxUint32 res = cc & (0x3f >> cnt);
 702                 while (cnt--)
 703                 {
 704                     cc = *psz;
 705                     if ((cc & 0xC0) != 0x80)
 706                     {
 707                         // invalid UTF-8 sequence
 708                         invalid = true;
 709                         break;
 710                     }
 711                     psz++;
 712                     res = (res << 6) | (cc & 0x3f);
 713                 }
 714                 if (invalid || res <= utf8_max[ocnt])
 715                 {
 716                     // illegal UTF-8 encoding
 717                     invalid = true;
 718                 }
 719                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 720                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 721                 {
 722                     // if one of our PUA characters turns up externally
 723                     // it must also be treated as an illegal sequence
 724                     // (a bit like you have to escape an escape character)
 725                     invalid = true;
 726                 }
 727                 else
 728                 {
 729 #ifdef WC_UTF16
 730                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 731                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 732                     if (pa == (size_t)-1)
 733                     {
 734                         invalid = true;
 735                     }
 736                     else
 737                     {
 738                         if (buf)
 739                             buf += pa;
 740                         len += pa;
 741                     }
 742 #else // !WC_UTF16
 743                     if (buf)
 744                         *buf++ = (wchar_t)res;
 745                     len++;
 746 #endif // WC_UTF16/!WC_UTF16
 747                 }
 748             }
 749             if (invalid)
 750             {
 751                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 752                 {
 753                     while (opsz < psz && (!buf || len < n))
 754                     {
 755 #ifdef WC_UTF16
 756                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 757                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 758                         wxASSERT(pa != (size_t)-1);
 759                         if (buf)
 760                             buf += pa;
 761                         opsz++;
 762                         len += pa;
 763 #else
 764                         if (buf)
 765                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 766                         opsz++;
 767                         len++;
 768 #endif
 769                     }
 770                 }
 771                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 772                 {
 773                     while (opsz < psz && (!buf || len < n))
 774                     {
 775                         if ( buf && len + 3 < n )
 776                         {
 777                             unsigned char on = *opsz;
 778                             *buf++ = L'\\';
 779                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 780                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 781                             *buf++ = (wchar_t)( L'0' + on % 010 );
 782                         }
 783                         opsz++;
 784                         len += 4;
 785                     }
 786                 }
 787                 else // MAP_INVALID_UTF8_NOT
 788                 {
 789                     return (size_t)-1;
 790                 }
 791             }
 792         }
 793     }
 794     if (buf && (len < n))
 795         *buf = 0;
 796     return len;
 797 }
 798
 799 static inline bool isoctal(wchar_t wch)
 800 {
 801     return L'0' <= wch && wch <= L'7';
 802 }
 803
 804 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 805 {
 806     size_t len = 0;
 807
 808     while (*psz && ((!buf) || (len < n)))
 809     {
 810         wxUint32 cc;
 811 #ifdef WC_UTF16
 812         // cast is ok for WC_UTF16
 813         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 814         psz += (pa == (size_t)-1) ? 1 : pa;
 815 #else
 816         cc=(*psz++) & 0x7fffffff;
 817 #endif
 818
 819         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 820                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 821         {
 822             if (buf)
 823                 *buf++ = (char)(cc - wxUnicodePUA);
 824             len++;
 825         }
 826         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 827                     && cc == L'\\' && psz[0] == L'\\' )
 828         {
 829             if (buf)
 830                 *buf++ = (char)cc;
 831             psz++;
 832             len++;
 833         }
 834         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 835                     cc == L'\\' &&
 836                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 837         {
 838             if (buf)
 839             {
 840                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 841                                  (psz[1] - L'0')*010 +
 842                                  (psz[2] - L'0'));
 843             }
 844
 845             psz += 3;
 846             len++;
 847         }
 848         else
 849         {
 850             unsigned cnt;
 851             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 852             if (!cnt)
 853             {
 854                 // plain ASCII char
 855                 if (buf)
 856                     *buf++ = (char) cc;
 857                 len++;
 858             }
 859
 860             else
 861             {
 862                 len += cnt + 1;
 863                 if (buf)
 864                 {
 865                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 866                     while (cnt--)
 867                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 868                 }
 869             }
 870         }
 871     }
 872
 873     if (buf && (len<n))
 874         *buf = 0;
 875
 876     return len;
 877 }
 878
 879 // ----------------------------------------------------------------------------
 880 // UTF-16
 881 // ----------------------------------------------------------------------------
 882
 883 #ifdef WORDS_BIGENDIAN
 884     #define wxMBConvUTF16straight wxMBConvUTF16BE
 885     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 886 #else
 887     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 888     #define wxMBConvUTF16straight wxMBConvUTF16LE
 889 #endif
 890
 891
 892 #ifdef WC_UTF16
 893
 894 // copy 16bit MB to 16bit String
 895 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 896 {
 897     size_t len=0;
 898
 899     while (*(wxUint16*)psz && (!buf || len < n))
 900     {
 901         if (buf)
 902             *buf++ = *(wxUint16*)psz;
 903         len++;
 904
 905         psz += sizeof(wxUint16);
 906     }
 907     if (buf && len<n)   *buf=0;
 908
 909     return len;
 910 }
 911
 912
 913 // copy 16bit String to 16bit MB
 914 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 915 {
 916     size_t len=0;
 917
 918     while (*psz && (!buf || len < n))
 919     {
 920         if (buf)
 921         {
 922             *(wxUint16*)buf = *psz;
 923             buf += sizeof(wxUint16);
 924         }
 925         len += sizeof(wxUint16);
 926         psz++;
 927     }
 928     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 929
 930     return len;
 931 }
 932
 933
 934 // swap 16bit MB to 16bit String
 935 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 936 {
 937     size_t len = 0;
 938
 939     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 940     // inside the string
 941     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 942     {
 943         if ( buf )
 944         {
 945             ((char *)buf)[0] = psz[1];
 946             ((char *)buf)[1] = psz[0];
 947             buf++;
 948         }
 949         len++;
 950         psz += 2;
 951     }
 952
 953     if ( buf && len < n )
 954         *buf = L'\0';
 955
 956     return len;
 957 }
 958
 959
 960 // swap 16bit MB to 16bit String
 961 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 962 {
 963     size_t len = 0;
 964
 965     while ( *psz && (!buf || len < n) )
 966     {
 967         if ( buf )
 968         {
 969             *buf++ = ((char*)psz)[1];
 970             *buf++ = ((char*)psz)[0];
 971         }
 972         len += 2;
 973         psz++;
 974     }
 975
 976     if ( buf && len < n )
 977         *buf = '\0';
 978
 979     return len;
 980 }
 981
 982
 983 #else // WC_UTF16
 984
 985
 986 // copy 16bit MB to 32bit String
 987 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 988 {
 989     size_t len=0;
 990
 991     while (*(wxUint16*)psz && (!buf || len < n))
 992     {
 993         wxUint32 cc;
 994         size_t pa=decode_utf16((wxUint16*)psz, cc);
 995         if (pa == (size_t)-1)
 996             return pa;
 997
 998         if (buf)
 999             *buf++ = (wchar_t)cc;
1000         len++;
1001         psz += pa * sizeof(wxUint16);
1002     }
1003     if (buf && len<n)   *buf=0;
1004
1005     return len;
1006 }
1007
1008
1009 // copy 32bit String to 16bit MB
1010 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1011 {
1012     size_t len=0;
1013
1014     while (*psz && (!buf || len < n))
1015     {
1016         wxUint16 cc[2];
1017         size_t pa=encode_utf16(*psz, cc);
1018
1019         if (pa == (size_t)-1)
1020             return pa;
1021
1022         if (buf)
1023         {
1024             *(wxUint16*)buf = cc[0];
1025             buf += sizeof(wxUint16);
1026             if (pa > 1)
1027             {
1028                 *(wxUint16*)buf = cc[1];
1029                 buf += sizeof(wxUint16);
1030             }
1031         }
1032
1033         len += pa*sizeof(wxUint16);
1034         psz++;
1035     }
1036     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1037
1038     return len;
1039 }
1040
1041
1042 // swap 16bit MB to 32bit String
1043 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1044 {
1045     size_t len=0;
1046
1047     while (*(wxUint16*)psz && (!buf || len < n))
1048     {
1049         wxUint32 cc;
1050         char tmp[4];
1051         tmp[0]=psz[1];  tmp[1]=psz[0];
1052         tmp[2]=psz[3];  tmp[3]=psz[2];
1053
1054         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1055         if (pa == (size_t)-1)
1056             return pa;
1057
1058         if (buf)
1059             *buf++ = (wchar_t)cc;
1060
1061         len++;
1062         psz += pa * sizeof(wxUint16);
1063     }
1064     if (buf && len<n)   *buf=0;
1065
1066     return len;
1067 }
1068
1069
1070 // swap 32bit String to 16bit MB
1071 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1072 {
1073     size_t len=0;
1074
1075     while (*psz && (!buf || len < n))
1076     {
1077         wxUint16 cc[2];
1078         size_t pa=encode_utf16(*psz, cc);
1079
1080         if (pa == (size_t)-1)
1081             return pa;
1082
1083         if (buf)
1084         {
1085             *buf++ = ((char*)cc)[1];
1086             *buf++ = ((char*)cc)[0];
1087             if (pa > 1)
1088             {
1089                 *buf++ = ((char*)cc)[3];
1090                 *buf++ = ((char*)cc)[2];
1091             }
1092         }
1093
1094         len += pa*sizeof(wxUint16);
1095         psz++;
1096     }
1097     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1098
1099     return len;
1100 }
1101
1102 #endif // WC_UTF16
1103
1104
1105 // ----------------------------------------------------------------------------
1106 // UTF-32
1107 // ----------------------------------------------------------------------------
1108
// The conversion functions below are written once for the "straight" variant
// (UTF-32 data copied without byte reordering) and once for the "swap" one
// (bytes of each 32 bit unit reversed); which of the LE/BE classes is which
// depends on whether WORDS_BIGENDIAN is defined.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global instances of the UTF-32 LE and BE conversion classes
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1120
1121
1122 #ifdef WC_UTF16
1123
1124 // copy 32bit MB to 16bit String
1125 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1126 {
1127     size_t len=0;
1128
1129     while (*(wxUint32*)psz && (!buf || len < n))
1130     {
1131         wxUint16 cc[2];
1132
1133         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1134         if (pa == (size_t)-1)
1135             return pa;
1136
1137         if (buf)
1138         {
1139             *buf++ = cc[0];
1140             if (pa > 1)
1141                 *buf++ = cc[1];
1142         }
1143         len += pa;
1144         psz += sizeof(wxUint32);
1145     }
1146     if (buf && len<n)   *buf=0;
1147
1148     return len;
1149 }
1150
1151
1152 // copy 16bit String to 32bit MB
1153 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1154 {
1155     size_t len=0;
1156
1157     while (*psz && (!buf || len < n))
1158     {
1159         wxUint32 cc;
1160
1161         // cast is ok for WC_UTF16
1162         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1163         if (pa == (size_t)-1)
1164             return pa;
1165
1166         if (buf)
1167         {
1168             *(wxUint32*)buf = cc;
1169             buf += sizeof(wxUint32);
1170         }
1171         len += sizeof(wxUint32);
1172         psz += pa;
1173     }
1174
1175     if (buf && len<=n-sizeof(wxUint32))
1176         *(wxUint32*)buf=0;
1177
1178     return len;
1179 }
1180
1181
1182
1183 // swap 32bit MB to 16bit String
1184 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1185 {
1186     size_t len=0;
1187
1188     while (*(wxUint32*)psz && (!buf || len < n))
1189     {
1190         char tmp[4];
1191         tmp[0] = psz[3];   tmp[1] = psz[2];
1192         tmp[2] = psz[1];   tmp[3] = psz[0];
1193
1194
1195         wxUint16 cc[2];
1196
1197         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1198         if (pa == (size_t)-1)
1199             return pa;
1200
1201         if (buf)
1202         {
1203             *buf++ = cc[0];
1204             if (pa > 1)
1205                 *buf++ = cc[1];
1206         }
1207         len += pa;
1208         psz += sizeof(wxUint32);
1209     }
1210
1211     if (buf && len<n)
1212         *buf=0;
1213
1214     return len;
1215 }
1216
1217
1218 // swap 16bit String to 32bit MB
1219 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1220 {
1221     size_t len=0;
1222
1223     while (*psz && (!buf || len < n))
1224     {
1225         char cc[4];
1226
1227         // cast is ok for WC_UTF16
1228         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1229         if (pa == (size_t)-1)
1230             return pa;
1231
1232         if (buf)
1233         {
1234             *buf++ = cc[3];
1235             *buf++ = cc[2];
1236             *buf++ = cc[1];
1237             *buf++ = cc[0];
1238         }
1239         len += sizeof(wxUint32);
1240         psz += pa;
1241     }
1242
1243     if (buf && len<=n-sizeof(wxUint32))
1244         *(wxUint32*)buf=0;
1245
1246     return len;
1247 }
1248
1249 #else // WC_UTF16
1250
1251
1252 // copy 32bit MB to 32bit String
1253 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1254 {
1255     size_t len=0;
1256
1257     while (*(wxUint32*)psz && (!buf || len < n))
1258     {
1259         if (buf)
1260             *buf++ = (wchar_t)(*(wxUint32*)psz);
1261         len++;
1262         psz += sizeof(wxUint32);
1263     }
1264
1265     if (buf && len<n)
1266         *buf=0;
1267
1268     return len;
1269 }
1270
1271
1272 // copy 32bit String to 32bit MB
1273 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1274 {
1275     size_t len=0;
1276
1277     while (*psz && (!buf || len < n))
1278     {
1279         if (buf)
1280         {
1281             *(wxUint32*)buf = *psz;
1282             buf += sizeof(wxUint32);
1283         }
1284
1285         len += sizeof(wxUint32);
1286         psz++;
1287     }
1288
1289     if (buf && len<=n-sizeof(wxUint32))
1290         *(wxUint32*)buf=0;
1291
1292     return len;
1293 }
1294
1295
1296 // swap 32bit MB to 32bit String
1297 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1298 {
1299     size_t len=0;
1300
1301     while (*(wxUint32*)psz && (!buf || len < n))
1302     {
1303         if (buf)
1304         {
1305             ((char *)buf)[0] = psz[3];
1306             ((char *)buf)[1] = psz[2];
1307             ((char *)buf)[2] = psz[1];
1308             ((char *)buf)[3] = psz[0];
1309             buf++;
1310         }
1311         len++;
1312         psz += sizeof(wxUint32);
1313     }
1314
1315     if (buf && len<n)
1316         *buf=0;
1317
1318     return len;
1319 }
1320
1321
1322 // swap 32bit String to 32bit MB
1323 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1324 {
1325     size_t len=0;
1326
1327     while (*psz && (!buf || len < n))
1328     {
1329         if (buf)
1330         {
1331             *buf++ = ((char *)psz)[3];
1332             *buf++ = ((char *)psz)[2];
1333             *buf++ = ((char *)psz)[1];
1334             *buf++ = ((char *)psz)[0];
1335         }
1336         len += sizeof(wxUint32);
1337         psz++;
1338     }
1339
1340     if (buf && len<=n-sizeof(wxUint32))
1341         *(wxUint32*)buf=0;
1342
1343     return len;
1344 }
1345
1346
1347 #endif // WC_UTF16
1348
1349
1350 // ============================================================================
1351 // The classes doing conversion using the iconv_xxx() functions
1352 // ============================================================================
1353
1354 #ifdef HAVE_ICONV
1355
1356 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1357 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1358 //     (unless there's yet another bug in glibc) the only case when iconv()
1359 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1360 //     left in the input buffer -- when _real_ error occurs,
1361 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1362 //     iconv() failure.
1363 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes left after the call; for glibc 2.0/2.1
// an E2BIG error with no input left is not counted as a failure
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used for the second
// argument of iconv() calls in this file (ICONV_CONST is defined elsewhere)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value of an iconv_t which doesn't refer to an open conversion
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the encoding
// used to look up the names of the charset corresponding to wchar_t, both
// are chosen according to the wchar_t size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1385
1386 // ----------------------------------------------------------------------------
1387 // wxMBConv_iconv: encapsulates an iconv character set
1388 // ----------------------------------------------------------------------------
1389
1390 class wxMBConv_iconv : public wxMBConv
1391 {
1392 public:
1393     wxMBConv_iconv(const wxChar *name);
1394     virtual ~wxMBConv_iconv();
1395
1396     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1397     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1398
1399     // classify this encoding as explained in wxMBConv::GetMBNulLen()
1400     // comment
1401     virtual size_t GetMBNulLen() const;
1402
1403     bool IsOk() const
1404         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1405
1406 protected:
1407     // the iconv handlers used to translate from multibyte to wide char and in
1408     // the other direction
1409     iconv_t m2w,
1410             w2m;
1411 #if wxUSE_THREADS
1412     // guards access to m2w and w2m objects
1413     wxMutex m_iconvMutex;
1414 #endif
1415
1416 private:
1417     // the name (for iconv_open()) of a wide char charset -- if none is
1418     // available on this machine, it will remain NULL
1419     static wxString ms_wcCharsetName;
1420
1421     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1422     // different endian-ness than the native one
1423     static bool ms_wcNeedsSwap;
1424
1425     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1426     // initially
1427     size_t m_minMBCharWidth;
1428 };
1429
1430 // make the constructor available for unit testing
1431 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1432 {
1433     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1434     if ( !result->IsOk() )
1435     {
1436         delete result;
1437         return 0;
1438     }
1439     return result;
1440 }
1441
// static members shared by all wxMBConv_iconv objects: the charset name is
// initially empty and is filled in by the constructor once a wide char
// charset accepted by iconv_open() is found
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1444
1445 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1446 {
1447     m_minMBCharWidth = 0;
1448
1449     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1450     // names for the charsets
1451     const wxCharBuffer cname(wxString(name).ToAscii());
1452
1453     // check for charset that represents wchar_t:
1454     if ( ms_wcCharsetName.empty() )
1455     {
1456         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1457
1458 #if wxUSE_FONTMAP
1459         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1460 #else // !wxUSE_FONTMAP
1461         static const wxChar *names[] =
1462         {
1463 #if SIZEOF_WCHAR_T == 4
1464             _T("UCS-4"),
1465 #elif SIZEOF_WCHAR_T = 2
1466             _T("UCS-2"),
1467 #endif
1468             NULL
1469         };
1470 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1471
1472         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1473         {
1474             const wxString nameCS(*names);
1475
1476             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1477             wxString nameXE(nameCS);
1478             #ifdef WORDS_BIGENDIAN
1479                 nameXE += _T("BE");
1480             #else // little endian
1481                 nameXE += _T("LE");
1482             #endif
1483
1484             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1485                        nameXE.c_str());
1486
1487             m2w = iconv_open(nameXE.ToAscii(), cname);
1488             if ( m2w == ICONV_T_INVALID )
1489             {
1490                 // try charset w/o bytesex info (e.g. "UCS4")
1491                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1492                            nameCS.c_str());
1493                 m2w = iconv_open(nameCS.ToAscii(), cname);
1494
1495                 // and check for bytesex ourselves:
1496                 if ( m2w != ICONV_T_INVALID )
1497                 {
1498                     char    buf[2], *bufPtr;
1499                     wchar_t wbuf[2], *wbufPtr;
1500                     size_t  insz, outsz;
1501                     size_t  res;
1502
1503                     buf[0] = 'A';
1504                     buf[1] = 0;
1505                     wbuf[0] = 0;
1506                     insz = 2;
1507                     outsz = SIZEOF_WCHAR_T * 2;
1508                     wbufPtr = wbuf;
1509                     bufPtr = buf;
1510
1511                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1512                                 (char**)&wbufPtr, &outsz);
1513
1514                     if (ICONV_FAILED(res, insz))
1515                     {
1516                         wxLogLastError(wxT("iconv"));
1517                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1518                                    nameCS.c_str());
1519                     }
1520                     else // ok, can convert to this encoding, remember it
1521                     {
1522                         ms_wcCharsetName = nameCS;
1523                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1524                     }
1525                 }
1526             }
1527             else // use charset not requiring byte swapping
1528             {
1529                 ms_wcCharsetName = nameXE;
1530             }
1531         }
1532
1533         wxLogTrace(TRACE_STRCONV,
1534                    wxT("iconv wchar_t charset is \"%s\"%s"),
1535                    ms_wcCharsetName.empty() ? _T("<none>")
1536                                             : ms_wcCharsetName.c_str(),
1537                    ms_wcNeedsSwap ? _T(" (needs swap)")
1538                                   : _T(""));
1539     }
1540     else // we already have ms_wcCharsetName
1541     {
1542         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1543     }
1544
1545     if ( ms_wcCharsetName.empty() )
1546     {
1547         w2m = ICONV_T_INVALID;
1548     }
1549     else
1550     {
1551         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1552         if ( w2m == ICONV_T_INVALID )
1553         {
1554             wxLogTrace(TRACE_STRCONV,
1555                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1556                        ms_wcCharsetName.c_str(), cname.data());
1557         }
1558     }
1559 }
1560
1561 wxMBConv_iconv::~wxMBConv_iconv()
1562 {
1563     if ( m2w != ICONV_T_INVALID )
1564         iconv_close(m2w);
1565     if ( w2m != ICONV_T_INVALID )
1566         iconv_close(w2m);
1567 }
1568
1569 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1570 {
1571     // find the string length: notice that must be done differently for
1572     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1573     size_t inbuf;
1574     const size_t nulLen = GetMBNulLen();
1575     switch ( nulLen )
1576     {
1577         default:
1578             return (size_t)-1;
1579
1580         case 1:
1581             inbuf = strlen(psz); // arguably more optimized than our version
1582             break;
1583
1584         case 2:
1585         case 4:
1586             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1587             // they also have to start at character boundary and not span two
1588             // adjacent characters
1589             const char *p;
1590             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1591                 ;
1592             inbuf = p - psz;
1593             break;
1594     }
1595
1596 #if wxUSE_THREADS
1597     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1598     //     Unfortunately there is a couple of global wxCSConv objects such as
1599     //     wxConvLocal that are used all over wx code, so we have to make sure
1600     //     the handle is used by at most one thread at the time. Otherwise
1601     //     only a few wx classes would be safe to use from non-main threads
1602     //     as MB<->WC conversion would fail "randomly".
1603     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1604 #endif // wxUSE_THREADS
1605
1606
1607     size_t outbuf = n * SIZEOF_WCHAR_T;
1608     size_t res, cres;
1609     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1610     wchar_t *bufPtr = buf;
1611     const char *pszPtr = psz;
1612
1613     if (buf)
1614     {
1615         // have destination buffer, convert there
1616         cres = iconv(m2w,
1617                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1618                      (char**)&bufPtr, &outbuf);
1619         res = n - (outbuf / SIZEOF_WCHAR_T);
1620
1621         if (ms_wcNeedsSwap)
1622         {
1623             // convert to native endianness
1624             for ( unsigned i = 0; i < res; i++ )
1625                 buf[n] = WC_BSWAP(buf[i]);
1626         }
1627
1628         // NUL-terminate the string if there is any space left
1629         if (res < n)
1630             buf[res] = 0;
1631     }
1632     else
1633     {
1634         // no destination buffer... convert using temp buffer
1635         // to calculate destination buffer requirement
1636         wchar_t tbuf[8];
1637         res = 0;
1638         do {
1639             bufPtr = tbuf;
1640             outbuf = 8*SIZEOF_WCHAR_T;
1641
1642             cres = iconv(m2w,
1643                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1644                          (char**)&bufPtr, &outbuf );
1645
1646             res += 8-(outbuf/SIZEOF_WCHAR_T);
1647         } while ((cres==(size_t)-1) && (errno==E2BIG));
1648     }
1649
1650     if (ICONV_FAILED(cres, inbuf))
1651     {
1652         //VS: it is ok if iconv fails, hence trace only
1653         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1654         return (size_t)-1;
1655     }
1656
1657     return res;
1658 }
1659
1660 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1661 {
1662 #if wxUSE_THREADS
1663     // NB: explained in MB2WC
1664     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1665 #endif
1666
1667     size_t inlen = wxWcslen(psz);
1668     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1669     size_t outbuf = n;
1670     size_t res, cres;
1671
1672     wchar_t *tmpbuf = 0;
1673
1674     if (ms_wcNeedsSwap)
1675     {
1676         // need to copy to temp buffer to switch endianness
1677         // (doing WC_BSWAP twice on the original buffer won't help, as it
1678         //  could be in read-only memory, or be accessed in some other thread)
1679         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1680         for ( size_t i = 0; i < inlen; i++ )
1681             tmpbuf[n] = WC_BSWAP(psz[i]);
1682         tmpbuf[inlen] = L'\0';
1683         psz = tmpbuf;
1684     }
1685
1686     if (buf)
1687     {
1688         // have destination buffer, convert there
1689         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1690
1691         res = n-outbuf;
1692
1693         // NB: iconv was given only wcslen(psz) characters on input, and so
1694         //     it couldn't convert the trailing zero. Let's do it ourselves
1695         //     if there's some room left for it in the output buffer.
1696         if (res < n)
1697             buf[0] = 0;
1698     }
1699     else
1700     {
1701         // no destination buffer... convert using temp buffer
1702         // to calculate destination buffer requirement
1703         char tbuf[16];
1704         res = 0;
1705         do {
1706             buf = tbuf; outbuf = 16;
1707
1708             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1709
1710             res += 16 - outbuf;
1711         } while ((cres==(size_t)-1) && (errno==E2BIG));
1712     }
1713
1714     if (ms_wcNeedsSwap)
1715     {
1716         free(tmpbuf);
1717     }
1718
1719     if (ICONV_FAILED(cres, inbuf))
1720     {
1721         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1722         return (size_t)-1;
1723     }
1724
1725     return res;
1726 }
1727
1728 size_t wxMBConv_iconv::GetMBNulLen() const
1729 {
1730     if ( m_minMBCharWidth == 0 )
1731     {
1732         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1733
1734 #if wxUSE_THREADS
1735         // NB: explained in MB2WC
1736         wxMutexLocker lock(self->m_iconvMutex);
1737 #endif
1738
1739         wchar_t *wnul = L"";
1740         char buf[8]; // should be enough for NUL in any encoding
1741         size_t inLen = sizeof(wchar_t),
1742                outLen = WXSIZEOF(buf);
1743         char *in = (char *)wnul;
1744         char *out = buf;
1745         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1746         {
1747             self->m_minMBCharWidth = (size_t)-1;
1748         }
1749         else // ok
1750         {
1751             self->m_minMBCharWidth = out - buf;
1752         }
1753     }
1754
1755     return m_minMBCharWidth;
1756 }
1757
1758 #endif // HAVE_ICONV
1759
1760
1761 // ============================================================================
1762 // Win32 conversion classes
1763 // ============================================================================
1764
1765 #ifdef wxHAVE_WIN32_MB2WC
1766
1767 // from utils.cpp
1768 #if wxUSE_FONTMAP
1769 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1770 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1771 #endif
1772
1773 class wxMBConv_win32 : public wxMBConv
1774 {
1775 public:
1776     wxMBConv_win32()
1777     {
1778         m_CodePage = CP_ACP;
1779         m_minMBCharWidth = 0;
1780     }
1781
1782 #if wxUSE_FONTMAP
1783     wxMBConv_win32(const wxChar* name)
1784     {
1785         m_CodePage = wxCharsetToCodepage(name);
1786         m_minMBCharWidth = 0;
1787     }
1788
1789     wxMBConv_win32(wxFontEncoding encoding)
1790     {
1791         m_CodePage = wxEncodingToCodepage(encoding);
1792         m_minMBCharWidth = 0;
1793     }
1794 #endif // wxUSE_FONTMAP
1795
1796     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1797     {
1798         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1799         // the behaviour is not compatible with the Unix version (using iconv)
1800         // and break the library itself, e.g. wxTextInputStream::NextChar()
1801         // wouldn't work if reading an incomplete MB char didn't result in an
1802         // error
1803         //
1804         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1805         // an error (tested under Windows Server 2003) and apparently it is
1806         // done on purpose, i.e. the function accepts any input in this case
1807         // and although I'd prefer to return error on ill-formed output, our
1808         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1809         // explicitly ill-formed according to RFC 2152) neither so we don't
1810         // even have any fallback here...
1811         //
1812         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1813         // Win XP or newer and if it is specified on older versions, conversion
1814         // from CP_UTF8 (which can have flags only 0 or MB_ERR_INVALID_CHARS)
1815         // fails. So we can only use the flag on newer Windows versions.
1816         // Additionally, the flag is not supported by UTF7, symbol and CJK
1817         // encodings. See here:
1818         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1819         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1820         int flags = 0;
1821         if ( m_CodePage != CP_UTF7 && m_CodePage != CP_SYMBOL &&
1822              m_CodePage < 50000 &&
1823              IsAtLeastWin2kSP4() )
1824         {
1825             flags = MB_ERR_INVALID_CHARS;
1826         }
1827         else if ( m_CodePage == CP_UTF8 )
1828         {
1829             // Avoid round-trip in the special case of UTF-8 by using our
1830             // own UTF-8 conversion code:
1831             return wxMBConvUTF8().MB2WC(buf, psz, n);
1832         }
1833
1834         const size_t len = ::MultiByteToWideChar
1835                              (
1836                                 m_CodePage,     // code page
1837                                 flags,          // flags: fall on error
1838                                 psz,            // input string
1839                                 -1,             // its length (NUL-terminated)
1840                                 buf,            // output string
1841                                 buf ? n : 0     // size of output buffer
1842                              );
1843         if ( !len )
1844         {
1845             // function totally failed
1846             return (size_t)-1;
1847         }
1848
1849         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1850         // check if we succeeded, by doing a double trip:
1851         if ( !flags && buf )
1852         {
1853             const size_t mbLen = strlen(psz);
1854             wxCharBuffer mbBuf(mbLen);
1855             if ( ::WideCharToMultiByte
1856                    (
1857                       m_CodePage,
1858                       0,
1859                       buf,
1860                       -1,
1861                       mbBuf.data(),
1862                       mbLen + 1,        // size in bytes, not length
1863                       NULL,
1864                       NULL
1865                    ) == 0 ||
1866                   strcmp(mbBuf, psz) != 0 )
1867             {
1868                 // we didn't obtain the same thing we started from, hence
1869                 // the conversion was lossy and we consider that it failed
1870                 return (size_t)-1;
1871             }
1872         }
1873
1874         // note that it returns count of written chars for buf != NULL and size
1875         // of the needed buffer for buf == NULL so in either case the length of
1876         // the string (which never includes the terminating NUL) is one less
1877         return len - 1;
1878     }
1879
1880     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1881     {
1882         /*
1883             we have a problem here: by default, WideCharToMultiByte() may
1884             replace characters unrepresentable in the target code page with bad
1885             quality approximations such as turning "1/2" symbol (U+00BD) into
1886             "1" for the code pages which don't have it and we, obviously, want
1887             to avoid this at any price
1888
1889             the trouble is that this function does it _silently_, i.e. it won't
1890             even tell us whether it did or not... Win98/2000 and higher provide
1891             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1892             we have to resort to a round trip, i.e. check that converting back
1893             results in the same string -- this is, of course, expensive but
1894             otherwise we simply can't be sure to not garble the data.
1895          */
1896
1897         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1898         // it doesn't work with CJK encodings (which we test for rather roughly
1899         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1900         // supporting it
1901         BOOL usedDef wxDUMMY_INITIALIZE(false);
1902         BOOL *pUsedDef;
1903         int flags;
1904         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1905         {
1906             // it's our lucky day
1907             flags = WC_NO_BEST_FIT_CHARS;
1908             pUsedDef = &usedDef;
1909         }
1910         else // old system or unsupported encoding
1911         {
1912             flags = 0;
1913             pUsedDef = NULL;
1914         }
1915
1916         const size_t len = ::WideCharToMultiByte
1917                              (
1918                                 m_CodePage,     // code page
1919                                 flags,          // either none or no best fit
1920                                 pwz,            // input string
1921                                 -1,             // it is (wide) NUL-terminated
1922                                 buf,            // output buffer
1923                                 buf ? n : 0,    // and its size
1924                                 NULL,           // default "replacement" char
1925                                 pUsedDef        // [out] was it used?
1926                              );
1927
1928         if ( !len )
1929         {
1930             // function totally failed
1931             return (size_t)-1;
1932         }
1933
1934         // if we were really converting, check if we succeeded
1935         if ( buf )
1936         {
1937             if ( flags )
1938             {
1939                 // check if the conversion failed, i.e. if any replacements
1940                 // were done
1941                 if ( usedDef )
1942                     return (size_t)-1;
1943             }
1944             else // we must resort to double tripping...
1945             {
1946                 wxWCharBuffer wcBuf(n);
1947                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1948                         wcscmp(wcBuf, pwz) != 0 )
1949                 {
1950                     // we didn't obtain the same thing we started from, hence
1951                     // the conversion was lossy and we consider that it failed
1952                     return (size_t)-1;
1953                 }
1954             }
1955         }
1956
1957         // see the comment above for the reason of "len - 1"
1958         return len - 1;
1959     }
1960
1961     virtual size_t GetMBNulLen() const
1962     {
1963         if ( m_minMBCharWidth == 0 )
1964         {
1965             int len = ::WideCharToMultiByte
1966                         (
1967                             m_CodePage,     // code page
1968                             0,              // no flags
1969                             L"",            // input string
1970                             1,              // translate just the NUL
1971                             NULL,           // output buffer
1972                             0,              // and its size
1973                             NULL,           // no replacement char
1974                             NULL            // [out] don't care if it was used
1975                         );
1976
1977             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1978             switch ( len )
1979             {
1980                 default:
1981                     wxLogDebug(_T("Unexpected NUL length %d"), len);
1982                     // fall through
1983
1984                 case 0:
1985                     self->m_minMBCharWidth = (size_t)-1;
1986                     break;
1987
1988                 case 1:
1989                 case 2:
1990                 case 4:
1991                     self->m_minMBCharWidth = len;
1992                     break;
1993             }
1994         }
1995
1996         return m_minMBCharWidth;
1997     }
1998
1999     bool IsOk() const { return m_CodePage != -1; }
2000
2001 private:
2002     static bool CanUseNoBestFit()
2003     {
2004         static int s_isWin98Or2k = -1;
2005
2006         if ( s_isWin98Or2k == -1 )
2007         {
2008             int verMaj, verMin;
2009             switch ( wxGetOsVersion(&verMaj, &verMin) )
2010             {
2011                 case wxWIN95:
2012                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2013                     break;
2014
2015                 case wxWINDOWS_NT:
2016                     s_isWin98Or2k = verMaj >= 5;
2017                     break;
2018
2019                 default:
2020                     // unknown, be conseravtive by default
2021                     s_isWin98Or2k = 0;
2022             }
2023
2024             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2025         }
2026
2027         return s_isWin98Or2k == 1;
2028     }
2029
2030     static bool IsAtLeastWin2kSP4()
2031     {
2032 #ifdef __WXWINCE__
2033         return false;
2034 #else
2035         static int s_isAtLeastWin2kSP4 = -1;
2036
2037         if ( s_isAtLeastWin2kSP4 == -1 )
2038         {
2039             OSVERSIONINFOEX ver;
2040
2041             memset(&ver, 0, sizeof(ver));
2042             ver.dwOSVersionInfoSize = sizeof(ver);
2043             GetVersionEx((OSVERSIONINFO*)&ver);
2044
2045             s_isAtLeastWin2kSP4 =
2046               ((ver.dwMajorVersion > 5) || // Vista+
2047                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2048                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2049                ver.wServicePackMajor >= 4)) // 2000 SP4+
2050               ? 1 : 0;
2051         }
2052
2053         return s_isAtLeastWin2kSP4 == 1;
2054 #endif
2055     }
2056
2057
2058     // the code page we're working with
2059     long m_CodePage;
2060
2061     // cached result of GetMBNulLen(), set to 0 initially meaning
2062     // "unknown"
2063     size_t m_minMBCharWidth;
2064 };
2065
2066 #endif // wxHAVE_WIN32_MB2WC
2067
2068 // ============================================================================
2069 // Cocoa conversion classes
2070 // ============================================================================
2071
2072 #if defined(__WXCOCOA__)
2073
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2077
2078 #include <CoreFoundation/CFString.h>
2079 #include <CoreFoundation/CFStringEncodingExt.h>
2080
2081 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2082 {
2083     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2084     if ( encoding == wxFONTENCODING_DEFAULT )
2085     {
2086         enc = CFStringGetSystemEncoding();
2087     }
2088     else switch( encoding)
2089     {
2090         case wxFONTENCODING_ISO8859_1 :
2091             enc = kCFStringEncodingISOLatin1 ;
2092             break ;
2093         case wxFONTENCODING_ISO8859_2 :
2094             enc = kCFStringEncodingISOLatin2;
2095             break ;
2096         case wxFONTENCODING_ISO8859_3 :
2097             enc = kCFStringEncodingISOLatin3 ;
2098             break ;
2099         case wxFONTENCODING_ISO8859_4 :
2100             enc = kCFStringEncodingISOLatin4;
2101             break ;
2102         case wxFONTENCODING_ISO8859_5 :
2103             enc = kCFStringEncodingISOLatinCyrillic;
2104             break ;
2105         case wxFONTENCODING_ISO8859_6 :
2106             enc = kCFStringEncodingISOLatinArabic;
2107             break ;
2108         case wxFONTENCODING_ISO8859_7 :
2109             enc = kCFStringEncodingISOLatinGreek;
2110             break ;
2111         case wxFONTENCODING_ISO8859_8 :
2112             enc = kCFStringEncodingISOLatinHebrew;
2113             break ;
2114         case wxFONTENCODING_ISO8859_9 :
2115             enc = kCFStringEncodingISOLatin5;
2116             break ;
2117         case wxFONTENCODING_ISO8859_10 :
2118             enc = kCFStringEncodingISOLatin6;
2119             break ;
2120         case wxFONTENCODING_ISO8859_11 :
2121             enc = kCFStringEncodingISOLatinThai;
2122             break ;
2123         case wxFONTENCODING_ISO8859_13 :
2124             enc = kCFStringEncodingISOLatin7;
2125             break ;
2126         case wxFONTENCODING_ISO8859_14 :
2127             enc = kCFStringEncodingISOLatin8;
2128             break ;
2129         case wxFONTENCODING_ISO8859_15 :
2130             enc = kCFStringEncodingISOLatin9;
2131             break ;
2132
2133         case wxFONTENCODING_KOI8 :
2134             enc = kCFStringEncodingKOI8_R;
2135             break ;
2136         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2137             enc = kCFStringEncodingDOSRussian;
2138             break ;
2139
2140 //      case wxFONTENCODING_BULGARIAN :
2141 //          enc = ;
2142 //          break ;
2143
2144         case wxFONTENCODING_CP437 :
2145             enc =kCFStringEncodingDOSLatinUS ;
2146             break ;
2147         case wxFONTENCODING_CP850 :
2148             enc = kCFStringEncodingDOSLatin1;
2149             break ;
2150         case wxFONTENCODING_CP852 :
2151             enc = kCFStringEncodingDOSLatin2;
2152             break ;
2153         case wxFONTENCODING_CP855 :
2154             enc = kCFStringEncodingDOSCyrillic;
2155             break ;
2156         case wxFONTENCODING_CP866 :
2157             enc =kCFStringEncodingDOSRussian ;
2158             break ;
2159         case wxFONTENCODING_CP874 :
2160             enc = kCFStringEncodingDOSThai;
2161             break ;
2162         case wxFONTENCODING_CP932 :
2163             enc = kCFStringEncodingDOSJapanese;
2164             break ;
2165         case wxFONTENCODING_CP936 :
2166             enc =kCFStringEncodingDOSChineseSimplif ;
2167             break ;
2168         case wxFONTENCODING_CP949 :
2169             enc = kCFStringEncodingDOSKorean;
2170             break ;
2171         case wxFONTENCODING_CP950 :
2172             enc = kCFStringEncodingDOSChineseTrad;
2173             break ;
2174         case wxFONTENCODING_CP1250 :
2175             enc = kCFStringEncodingWindowsLatin2;
2176             break ;
2177         case wxFONTENCODING_CP1251 :
2178             enc =kCFStringEncodingWindowsCyrillic ;
2179             break ;
2180         case wxFONTENCODING_CP1252 :
2181             enc =kCFStringEncodingWindowsLatin1 ;
2182             break ;
2183         case wxFONTENCODING_CP1253 :
2184             enc = kCFStringEncodingWindowsGreek;
2185             break ;
2186         case wxFONTENCODING_CP1254 :
2187             enc = kCFStringEncodingWindowsLatin5;
2188             break ;
2189         case wxFONTENCODING_CP1255 :
2190             enc =kCFStringEncodingWindowsHebrew ;
2191             break ;
2192         case wxFONTENCODING_CP1256 :
2193             enc =kCFStringEncodingWindowsArabic ;
2194             break ;
2195         case wxFONTENCODING_CP1257 :
2196             enc = kCFStringEncodingWindowsBalticRim;
2197             break ;
2198 //   This only really encodes to UTF7 (if that) evidently
2199 //        case wxFONTENCODING_UTF7 :
2200 //            enc = kCFStringEncodingNonLossyASCII ;
2201 //            break ;
2202         case wxFONTENCODING_UTF8 :
2203             enc = kCFStringEncodingUTF8 ;
2204             break ;
2205         case wxFONTENCODING_EUC_JP :
2206             enc = kCFStringEncodingEUC_JP;
2207             break ;
2208         case wxFONTENCODING_UTF16 :
2209             enc = kCFStringEncodingUnicode ;
2210             break ;
2211         case wxFONTENCODING_MACROMAN :
2212             enc = kCFStringEncodingMacRoman ;
2213             break ;
2214         case wxFONTENCODING_MACJAPANESE :
2215             enc = kCFStringEncodingMacJapanese ;
2216             break ;
2217         case wxFONTENCODING_MACCHINESETRAD :
2218             enc = kCFStringEncodingMacChineseTrad ;
2219             break ;
2220         case wxFONTENCODING_MACKOREAN :
2221             enc = kCFStringEncodingMacKorean ;
2222             break ;
2223         case wxFONTENCODING_MACARABIC :
2224             enc = kCFStringEncodingMacArabic ;
2225             break ;
2226         case wxFONTENCODING_MACHEBREW :
2227             enc = kCFStringEncodingMacHebrew ;
2228             break ;
2229         case wxFONTENCODING_MACGREEK :
2230             enc = kCFStringEncodingMacGreek ;
2231             break ;
2232         case wxFONTENCODING_MACCYRILLIC :
2233             enc = kCFStringEncodingMacCyrillic ;
2234             break ;
2235         case wxFONTENCODING_MACDEVANAGARI :
2236             enc = kCFStringEncodingMacDevanagari ;
2237             break ;
2238         case wxFONTENCODING_MACGURMUKHI :
2239             enc = kCFStringEncodingMacGurmukhi ;
2240             break ;
2241         case wxFONTENCODING_MACGUJARATI :
2242             enc = kCFStringEncodingMacGujarati ;
2243             break ;
2244         case wxFONTENCODING_MACORIYA :
2245             enc = kCFStringEncodingMacOriya ;
2246             break ;
2247         case wxFONTENCODING_MACBENGALI :
2248             enc = kCFStringEncodingMacBengali ;
2249             break ;
2250         case wxFONTENCODING_MACTAMIL :
2251             enc = kCFStringEncodingMacTamil ;
2252             break ;
2253         case wxFONTENCODING_MACTELUGU :
2254             enc = kCFStringEncodingMacTelugu ;
2255             break ;
2256         case wxFONTENCODING_MACKANNADA :
2257             enc = kCFStringEncodingMacKannada ;
2258             break ;
2259         case wxFONTENCODING_MACMALAJALAM :
2260             enc = kCFStringEncodingMacMalayalam ;
2261             break ;
2262         case wxFONTENCODING_MACSINHALESE :
2263             enc = kCFStringEncodingMacSinhalese ;
2264             break ;
2265         case wxFONTENCODING_MACBURMESE :
2266             enc = kCFStringEncodingMacBurmese ;
2267             break ;
2268         case wxFONTENCODING_MACKHMER :
2269             enc = kCFStringEncodingMacKhmer ;
2270             break ;
2271         case wxFONTENCODING_MACTHAI :
2272             enc = kCFStringEncodingMacThai ;
2273             break ;
2274         case wxFONTENCODING_MACLAOTIAN :
2275             enc = kCFStringEncodingMacLaotian ;
2276             break ;
2277         case wxFONTENCODING_MACGEORGIAN :
2278             enc = kCFStringEncodingMacGeorgian ;
2279             break ;
2280         case wxFONTENCODING_MACARMENIAN :
2281             enc = kCFStringEncodingMacArmenian ;
2282             break ;
2283         case wxFONTENCODING_MACCHINESESIMP :
2284             enc = kCFStringEncodingMacChineseSimp ;
2285             break ;
2286         case wxFONTENCODING_MACTIBETAN :
2287             enc = kCFStringEncodingMacTibetan ;
2288             break ;
2289         case wxFONTENCODING_MACMONGOLIAN :
2290             enc = kCFStringEncodingMacMongolian ;
2291             break ;
2292         case wxFONTENCODING_MACETHIOPIC :
2293             enc = kCFStringEncodingMacEthiopic ;
2294             break ;
2295         case wxFONTENCODING_MACCENTRALEUR :
2296             enc = kCFStringEncodingMacCentralEurRoman ;
2297             break ;
2298         case wxFONTENCODING_MACVIATNAMESE :
2299             enc = kCFStringEncodingMacVietnamese ;
2300             break ;
2301         case wxFONTENCODING_MACARABICEXT :
2302             enc = kCFStringEncodingMacExtArabic ;
2303             break ;
2304         case wxFONTENCODING_MACSYMBOL :
2305             enc = kCFStringEncodingMacSymbol ;
2306             break ;
2307         case wxFONTENCODING_MACDINGBATS :
2308             enc = kCFStringEncodingMacDingbats ;
2309             break ;
2310         case wxFONTENCODING_MACTURKISH :
2311             enc = kCFStringEncodingMacTurkish ;
2312             break ;
2313         case wxFONTENCODING_MACCROATIAN :
2314             enc = kCFStringEncodingMacCroatian ;
2315             break ;
2316         case wxFONTENCODING_MACICELANDIC :
2317             enc = kCFStringEncodingMacIcelandic ;
2318             break ;
2319         case wxFONTENCODING_MACROMANIAN :
2320             enc = kCFStringEncodingMacRomanian ;
2321             break ;
2322         case wxFONTENCODING_MACCELTIC :
2323             enc = kCFStringEncodingMacCeltic ;
2324             break ;
2325         case wxFONTENCODING_MACGAELIC :
2326             enc = kCFStringEncodingMacGaelic ;
2327             break ;
2328 //      case wxFONTENCODING_MACKEYBOARD :
2329 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2330 //          break ;
2331         default :
2332             // because gcc is picky
2333             break ;
2334     } ;
2335     return enc ;
2336 }
2337
2338 class wxMBConv_cocoa : public wxMBConv
2339 {
2340 public:
2341     wxMBConv_cocoa()
2342     {
2343         Init(CFStringGetSystemEncoding()) ;
2344     }
2345
2346 #if wxUSE_FONTMAP
2347     wxMBConv_cocoa(const wxChar* name)
2348     {
2349         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2350     }
2351 #endif
2352
2353     wxMBConv_cocoa(wxFontEncoding encoding)
2354     {
2355         Init( wxCFStringEncFromFontEnc(encoding) );
2356     }
2357
2358     ~wxMBConv_cocoa()
2359     {
2360     }
2361
2362     void Init( CFStringEncoding encoding)
2363     {
2364         m_encoding = encoding ;
2365     }
2366
2367     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2368     {
2369         wxASSERT(szUnConv);
2370
2371         CFStringRef theString = CFStringCreateWithBytes (
2372                                                 NULL, //the allocator
2373                                                 (const UInt8*)szUnConv,
2374                                                 strlen(szUnConv),
2375                                                 m_encoding,
2376                                                 false //no BOM/external representation
2377                                                 );
2378
2379         wxASSERT(theString);
2380
2381         size_t nOutLength = CFStringGetLength(theString);
2382
2383         if (szOut == NULL)
2384         {
2385             CFRelease(theString);
2386             return nOutLength;
2387         }
2388
2389         CFRange theRange = { 0, nOutSize };
2390
2391 #if SIZEOF_WCHAR_T == 4
2392         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2393 #endif
2394
2395         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2396
2397         CFRelease(theString);
2398
2399         szUniCharBuffer[nOutLength] = '\0' ;
2400
2401 #if SIZEOF_WCHAR_T == 4
2402         wxMBConvUTF16 converter ;
2403         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2404         delete[] szUniCharBuffer;
2405 #endif
2406
2407         return nOutLength;
2408     }
2409
2410     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2411     {
2412         wxASSERT(szUnConv);
2413
2414         size_t nRealOutSize;
2415         size_t nBufSize = wxWcslen(szUnConv);
2416         UniChar* szUniBuffer = (UniChar*) szUnConv;
2417
2418 #if SIZEOF_WCHAR_T == 4
2419         wxMBConvUTF16 converter ;
2420         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2421         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2422         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2423         nBufSize /= sizeof(UniChar);
2424 #endif
2425
2426         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2427                                 NULL, //allocator
2428                                 szUniBuffer,
2429                                 nBufSize,
2430                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2431                             );
2432
2433         wxASSERT(theString);
2434
2435         //Note that CER puts a BOM when converting to unicode
2436         //so we  check and use getchars instead in that case
2437         if (m_encoding == kCFStringEncodingUnicode)
2438         {
2439             if (szOut != NULL)
2440                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2441
2442             nRealOutSize = CFStringGetLength(theString) + 1;
2443         }
2444         else
2445         {
2446             CFStringGetBytes(
2447                 theString,
2448                 CFRangeMake(0, CFStringGetLength(theString)),
2449                 m_encoding,
2450                 0, //what to put in characters that can't be converted -
2451                     //0 tells CFString to return NULL if it meets such a character
2452                 false, //not an external representation
2453                 (UInt8*) szOut,
2454                 nOutSize,
2455                 (CFIndex*) &nRealOutSize
2456                         );
2457         }
2458
2459         CFRelease(theString);
2460
2461 #if SIZEOF_WCHAR_T == 4
2462         delete[] szUniBuffer;
2463 #endif
2464
2465         return  nRealOutSize - 1;
2466     }
2467
2468     bool IsOk() const
2469     {
2470         return m_encoding != kCFStringEncodingInvalidId &&
2471               CFStringIsEncodingAvailable(m_encoding);
2472     }
2473
2474 private:
2475     CFStringEncoding m_encoding ;
2476 };
2477
2478 #endif // defined(__WXCOCOA__)
2479
2480 // ============================================================================
2481 // Mac conversion classes
2482 // ============================================================================
2483
2484 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2485
2486 class wxMBConv_mac : public wxMBConv
2487 {
2488 public:
2489     wxMBConv_mac()
2490     {
2491         Init(CFStringGetSystemEncoding()) ;
2492     }
2493
2494 #if wxUSE_FONTMAP
2495     wxMBConv_mac(const wxChar* name)
2496     {
2497         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2498     }
2499 #endif
2500
2501     wxMBConv_mac(wxFontEncoding encoding)
2502     {
2503         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2504     }
2505
2506     ~wxMBConv_mac()
2507     {
2508         OSStatus status = noErr ;
2509         status = TECDisposeConverter(m_MB2WC_converter);
2510         status = TECDisposeConverter(m_WC2MB_converter);
2511     }
2512
2513
2514     void Init( TextEncodingBase encoding)
2515     {
2516         OSStatus status = noErr ;
2517         m_char_encoding = encoding ;
2518         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2519
2520         status = TECCreateConverter(&m_MB2WC_converter,
2521                                     m_char_encoding,
2522                                     m_unicode_encoding);
2523         status = TECCreateConverter(&m_WC2MB_converter,
2524                                     m_unicode_encoding,
2525                                     m_char_encoding);
2526     }
2527
2528     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2529     {
2530         OSStatus status = noErr ;
2531         ByteCount byteOutLen ;
2532         ByteCount byteInLen = strlen(psz) ;
2533         wchar_t *tbuf = NULL ;
2534         UniChar* ubuf = NULL ;
2535         size_t res = 0 ;
2536
2537         if (buf == NULL)
2538         {
2539             //apple specs say at least 32
2540             n = wxMax( 32 , byteInLen ) ;
2541             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2542         }
2543         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2544 #if SIZEOF_WCHAR_T == 4
2545         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2546 #else
2547         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2548 #endif
2549         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2550           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2551 #if SIZEOF_WCHAR_T == 4
2552         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2553         // is not properly terminated we get random characters at the end
2554         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2555         wxMBConvUTF16 converter ;
2556         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2557         free( ubuf ) ;
2558 #else
2559         res = byteOutLen / sizeof( UniChar ) ;
2560 #endif
2561         if ( buf == NULL )
2562              free(tbuf) ;
2563
2564         if ( buf  && res < n)
2565             buf[res] = 0;
2566
2567         return res ;
2568     }
2569
2570     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2571     {
2572         OSStatus status = noErr ;
2573         ByteCount byteOutLen ;
2574         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2575
2576         char *tbuf = NULL ;
2577
2578         if (buf == NULL)
2579         {
2580             //apple specs say at least 32
2581             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2582             tbuf = (char*) malloc( n ) ;
2583         }
2584
2585         ByteCount byteBufferLen = n ;
2586         UniChar* ubuf = NULL ;
2587 #if SIZEOF_WCHAR_T == 4
2588         wxMBConvUTF16 converter ;
2589         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2590         byteInLen = unicharlen ;
2591         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2592         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2593 #else
2594         ubuf = (UniChar*) psz ;
2595 #endif
2596         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2597             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2598 #if SIZEOF_WCHAR_T == 4
2599         free( ubuf ) ;
2600 #endif
2601         if ( buf == NULL )
2602             free(tbuf) ;
2603
2604         size_t res = byteOutLen ;
2605         if ( buf  && res < n)
2606         {
2607             buf[res] = 0;
2608
2609             //we need to double-trip to verify it didn't insert any ? in place
2610             //of bogus characters
2611             wxWCharBuffer wcBuf(n);
2612             size_t pszlen = wxWcslen(psz);
2613             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2614                         wxWcslen(wcBuf) != pszlen ||
2615                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2616             {
2617                 // we didn't obtain the same thing we started from, hence
2618                 // the conversion was lossy and we consider that it failed
2619                 return (size_t)-1;
2620             }
2621         }
2622
2623         return res ;
2624     }
2625
2626     bool IsOk() const
2627         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2628
2629 private:
2630     TECObjectRef m_MB2WC_converter ;
2631     TECObjectRef m_WC2MB_converter ;
2632
2633     TextEncodingBase m_char_encoding ;
2634     TextEncodingBase m_unicode_encoding ;
2635 };
2636
2637 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2638
2639 // ============================================================================
2640 // wxEncodingConverter based conversion classes
2641 // ============================================================================
2642
2643 #if wxUSE_FONTMAP
2644
2645 class wxMBConv_wxwin : public wxMBConv
2646 {
2647 private:
2648     void Init()
2649     {
2650         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2651                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2652     }
2653
2654 public:
2655     // temporarily just use wxEncodingConverter stuff,
2656     // so that it works while a better implementation is built
2657     wxMBConv_wxwin(const wxChar* name)
2658     {
2659         if (name)
2660             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2661         else
2662             m_enc = wxFONTENCODING_SYSTEM;
2663
2664         Init();
2665     }
2666
2667     wxMBConv_wxwin(wxFontEncoding enc)
2668     {
2669         m_enc = enc;
2670
2671         Init();
2672     }
2673
2674     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2675     {
2676         size_t inbuf = strlen(psz);
2677         if (buf)
2678         {
2679             if (!m2w.Convert(psz,buf))
2680                 return (size_t)-1;
2681         }
2682         return inbuf;
2683     }
2684
2685     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2686     {
2687         const size_t inbuf = wxWcslen(psz);
2688         if (buf)
2689         {
2690             if (!w2m.Convert(psz,buf))
2691                 return (size_t)-1;
2692         }
2693
2694         return inbuf;
2695     }
2696
2697     virtual size_t GetMBNulLen() const
2698     {
2699         switch ( m_enc )
2700         {
2701             case wxFONTENCODING_UTF16BE:
2702             case wxFONTENCODING_UTF16LE:
2703                 return 2;
2704
2705             case wxFONTENCODING_UTF32BE:
2706             case wxFONTENCODING_UTF32LE:
2707                 return 4;
2708
2709             default:
2710                 return 1;
2711         }
2712     }
2713
2714     bool IsOk() const { return m_ok; }
2715
2716 public:
2717     wxFontEncoding m_enc;
2718     wxEncodingConverter m2w, w2m;
2719
2720 private:
2721     // were we initialized successfully?
2722     bool m_ok;
2723
2724     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2725 };
2726
2727 // make the constructors available for unit testing
2728 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2729 {
2730     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2731     if ( !result->IsOk() )
2732     {
2733         delete result;
2734         return 0;
2735     }
2736     return result;
2737 }
2738
2739 #endif // wxUSE_FONTMAP
2740
2741 // ============================================================================
2742 // wxCSConv implementation
2743 // ============================================================================
2744
2745 void wxCSConv::Init()
2746 {
2747     m_name = NULL;
2748     m_convReal =  NULL;
2749     m_deferred = true;
2750 }
2751
2752 wxCSConv::wxCSConv(const wxChar *charset)
2753 {
2754     Init();
2755
2756     if ( charset )
2757     {
2758         SetName(charset);
2759     }
2760
2761 #if wxUSE_FONTMAP
2762     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2763 #else
2764     m_encoding = wxFONTENCODING_SYSTEM;
2765 #endif
2766 }
2767
2768 wxCSConv::wxCSConv(wxFontEncoding encoding)
2769 {
2770     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2771     {
2772         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2773
2774         encoding = wxFONTENCODING_SYSTEM;
2775     }
2776
2777     Init();
2778
2779     m_encoding = encoding;
2780 }
2781
2782 wxCSConv::~wxCSConv()
2783 {
2784     Clear();
2785 }
2786
2787 wxCSConv::wxCSConv(const wxCSConv& conv)
2788         : wxMBConv()
2789 {
2790     Init();
2791
2792     SetName(conv.m_name);
2793     m_encoding = conv.m_encoding;
2794 }
2795
2796 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2797 {
2798     Clear();
2799
2800     SetName(conv.m_name);
2801     m_encoding = conv.m_encoding;
2802
2803     return *this;
2804 }
2805
2806 void wxCSConv::Clear()
2807 {
2808     free(m_name);
2809     delete m_convReal;
2810
2811     m_name = NULL;
2812     m_convReal = NULL;
2813 }
2814
2815 void wxCSConv::SetName(const wxChar *charset)
2816 {
2817     if (charset)
2818     {
2819         m_name = wxStrdup(charset);
2820         m_deferred = true;
2821     }
2822 }
2823
2824 #if wxUSE_FONTMAP
2825 #include "wx/hashmap.h"
2826
// hash map type associating a string with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// Cache used by wxCSConv::DoCreate() when looking for a charset name that
// iconv accepts for an encoding: the first name which worked is remembered
// here and an empty string is stored if none of them did.
static wxEncodingNameCache gs_nameCache;
2831 #endif
2832
2833 wxMBConv *wxCSConv::DoCreate() const
2834 {
2835 #if wxUSE_FONTMAP
2836     wxLogTrace(TRACE_STRCONV,
2837                wxT("creating conversion for %s"),
2838                (m_name ? m_name
2839                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2840 #endif // wxUSE_FONTMAP
2841
2842     // check for the special case of ASCII or ISO8859-1 charset: as we have
2843     // special knowledge of it anyhow, we don't need to create a special
2844     // conversion object
2845     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2846             m_encoding == wxFONTENCODING_DEFAULT )
2847     {
2848         // don't convert at all
2849         return NULL;
2850     }
2851
2852     // we trust OS to do conversion better than we can so try external
2853     // conversion methods first
2854     //
2855     // the full order is:
2856     //      1. OS conversion (iconv() under Unix or Win32 API)
2857     //      2. hard coded conversions for UTF
2858     //      3. wxEncodingConverter as fall back
2859
2860     // step (1)
2861 #ifdef HAVE_ICONV
2862 #if !wxUSE_FONTMAP
2863     if ( m_name )
2864 #endif // !wxUSE_FONTMAP
2865     {
2866         wxString name(m_name);
2867         wxFontEncoding encoding(m_encoding);
2868
2869         if ( !name.empty() )
2870         {
2871             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2872             if ( conv->IsOk() )
2873                 return conv;
2874
2875             delete conv;
2876
2877 #if wxUSE_FONTMAP
2878             encoding =
2879                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2880 #endif // wxUSE_FONTMAP
2881         }
2882 #if wxUSE_FONTMAP
2883         {
2884             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2885             if ( it != gs_nameCache.end() )
2886             {
2887                 if ( it->second.empty() )
2888                     return NULL;
2889
2890                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2891                 if ( conv->IsOk() )
2892                     return conv;
2893
2894                 delete conv;
2895             }
2896
2897             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2898
2899             for ( ; *names; ++names )
2900             {
2901                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2902                 if ( conv->IsOk() )
2903                 {
2904                     gs_nameCache[encoding] = *names;
2905                     return conv;
2906                 }
2907
2908                 delete conv;
2909             }
2910
2911             gs_nameCache[encoding] = _T(""); // cache the failure
2912         }
2913 #endif // wxUSE_FONTMAP
2914     }
2915 #endif // HAVE_ICONV
2916
2917 #ifdef wxHAVE_WIN32_MB2WC
2918     {
2919 #if wxUSE_FONTMAP
2920         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2921                                       : new wxMBConv_win32(m_encoding);
2922         if ( conv->IsOk() )
2923             return conv;
2924
2925         delete conv;
2926 #else
2927         return NULL;
2928 #endif
2929     }
2930 #endif // wxHAVE_WIN32_MB2WC
2931 #if defined(__WXMAC__)
2932     {
2933         // leave UTF16 and UTF32 to the built-ins of wx
2934         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2935             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2936         {
2937
2938 #if wxUSE_FONTMAP
2939             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2940                                         : new wxMBConv_mac(m_encoding);
2941 #else
2942             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2943 #endif
2944             if ( conv->IsOk() )
2945                  return conv;
2946
2947             delete conv;
2948         }
2949     }
2950 #endif
2951 #if defined(__WXCOCOA__)
2952     {
2953         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2954         {
2955
2956 #if wxUSE_FONTMAP
2957             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2958                                           : new wxMBConv_cocoa(m_encoding);
2959 #else
2960             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2961 #endif
2962             if ( conv->IsOk() )
2963                  return conv;
2964
2965             delete conv;
2966         }
2967     }
2968 #endif
2969     // step (2)
2970     wxFontEncoding enc = m_encoding;
2971 #if wxUSE_FONTMAP
2972     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2973     {
2974         // use "false" to suppress interactive dialogs -- we can be called from
2975         // anywhere and popping up a dialog from here is the last thing we want to
2976         // do
2977         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2978     }
2979 #endif // wxUSE_FONTMAP
2980
2981     switch ( enc )
2982     {
2983         case wxFONTENCODING_UTF7:
2984              return new wxMBConvUTF7;
2985
2986         case wxFONTENCODING_UTF8:
2987              return new wxMBConvUTF8;
2988
2989         case wxFONTENCODING_UTF16BE:
2990              return new wxMBConvUTF16BE;
2991
2992         case wxFONTENCODING_UTF16LE:
2993              return new wxMBConvUTF16LE;
2994
2995         case wxFONTENCODING_UTF32BE:
2996              return new wxMBConvUTF32BE;
2997
2998         case wxFONTENCODING_UTF32LE:
2999              return new wxMBConvUTF32LE;
3000
3001         default:
3002              // nothing to do but put here to suppress gcc warnings
3003              ;
3004     }
3005
3006     // step (3)
3007 #if wxUSE_FONTMAP
3008     {
3009         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3010                                       : new wxMBConv_wxwin(m_encoding);
3011         if ( conv->IsOk() )
3012             return conv;
3013
3014         delete conv;
3015     }
3016 #endif // wxUSE_FONTMAP
3017
3018     // NB: This is a hack to prevent deadlock. What could otherwise happen
3019     //     in Unicode build: wxConvLocal creation ends up being here
3020     //     because of some failure and logs the error. But wxLog will try to
3021     //     attach timestamp, for which it will need wxConvLocal (to convert
3022     //     time to char* and then wchar_t*), but that fails, tries to log
3023     //     error, but wxLog has a (already locked) critical section that
3024     //     guards static buffer.
3025     static bool alreadyLoggingError = false;
3026     if (!alreadyLoggingError)
3027     {
3028         alreadyLoggingError = true;
3029         wxLogError(_("Cannot convert from the charset '%s'!"),
3030                    m_name ? m_name
3031                       :
3032 #if wxUSE_FONTMAP
3033                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3034 #else // !wxUSE_FONTMAP
3035                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3036 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3037               );
3038         alreadyLoggingError = false;
3039     }
3040
3041     return NULL;
3042 }
3043
3044 void wxCSConv::CreateConvIfNeeded() const
3045 {
3046     if ( m_deferred )
3047     {
3048         wxCSConv *self = (wxCSConv *)this; // const_cast
3049
3050 #if wxUSE_INTL
3051         // if we don't have neither the name nor the encoding, use the default
3052         // encoding for this system
3053         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3054         {
3055             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3056         }
3057 #endif // wxUSE_INTL
3058
3059         self->m_convReal = DoCreate();
3060         self->m_deferred = false;
3061     }
3062 }
3063
3064 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3065 {
3066     CreateConvIfNeeded();
3067
3068     if (m_convReal)
3069         return m_convReal->MB2WC(buf, psz, n);
3070
3071     // latin-1 (direct)
3072     size_t len = strlen(psz);
3073
3074     if (buf)
3075     {
3076         for (size_t c = 0; c <= len; c++)
3077             buf[c] = (unsigned char)(psz[c]);
3078     }
3079
3080     return len;
3081 }
3082
3083 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3084 {
3085     CreateConvIfNeeded();
3086
3087     if (m_convReal)
3088         return m_convReal->WC2MB(buf, psz, n);
3089
3090     // latin-1 (direct)
3091     const size_t len = wxWcslen(psz);
3092     if (buf)
3093     {
3094         for (size_t c = 0; c <= len; c++)
3095         {
3096             if (psz[c] > 0xFF)
3097                 return (size_t)-1;
3098             buf[c] = (char)psz[c];
3099         }
3100     }
3101     else
3102     {
3103         for (size_t c = 0; c <= len; c++)
3104         {
3105             if (psz[c] > 0xFF)
3106                 return (size_t)-1;
3107         }
3108     }
3109
3110     return len;
3111 }
3112
3113 size_t wxCSConv::GetMBNulLen() const
3114 {
3115     CreateConvIfNeeded();
3116
3117     if ( m_convReal )
3118     {
3119         return m_convReal->GetMBNulLen();
3120     }
3121
3122     return 1;
3123 }
3124
3125 // ----------------------------------------------------------------------------
3126 // globals
3127 // ----------------------------------------------------------------------------
3128
3129 #ifdef __WINDOWS__
3130     static wxMBConv_win32 wxConvLibcObj;
3131 #elif defined(__WXMAC__) && !defined(__MACH__)
3132     static wxMBConv_mac wxConvLibcObj ;
3133 #else
3134     static wxMBConvLibc wxConvLibcObj;
3135 #endif
3136
3137 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3138 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3139 static wxMBConvUTF7 wxConvUTF7Obj;
3140 static wxMBConvUTF8 wxConvUTF8Obj;
3141
3142 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3143 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3144 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3145 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3146 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3147 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3148 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3149 #ifdef __WXOSX__
3150                                     wxConvUTF8Obj;
3151 #else
3152                                     wxConvLibcObj;
3153 #endif
3154
3155
3156 #else // !wxUSE_WCHAR_T
3157
3158 // stand-ins in absence of wchar_t
3159 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3160                                 wxConvISO8859_1,
3161                                 wxConvLocal,
3162                                 wxConvUTF8;
3163
3164 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T