src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded; but
 207             // still account for the trailing NULL
 208             dstWritten++;
 209             break;
 210         }
 211
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for trailing NUL
 216
 217         dstWritten += lenChunk;
 218
 219         if ( dst )
 220         {
 221             if ( dstWritten > dstLen )
 222                 return wxCONV_FAILED;
 223
 224             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 225                 return wxCONV_FAILED;
 226
 227             dst += lenChunk;
 228         }
 229
 230         if ( !srcEnd )
 231         {
 232             // we convert the entire string in this cas, as we suppose that the
 233             // string is NUL-terminated and so srcEnd is not used at all
 234             break;
 235         }
 236
 237         // advance the input pointer past the end of this chunk
 238         while ( NotAllNULs(src, nulLen) )
 239         {
 240             // notice that we must skip over multiple bytes here as we suppose
 241             // that if NUL takes 2 or 4 bytes, then all the other characters do
 242             // too and so if advanced by a single byte we might erroneously
 243             // detect sequences of NUL bytes in the middle of the input
 244             src += nulLen;
 245         }
 246
 247         src += nulLen; // skipping over its terminator as well
 248
 249         // note that ">=" (and not just "==") is needed here as the terminator
 250         // we skipped just above could be inside or just after the buffer
 251         // delimited by inEnd
 252         if ( src >= srcEnd )
 253             break;
 254     }
 255
 256     return dstWritten;
 257 }
 258
 259 size_t
 260 wxMBConv::FromWChar(char *dst, size_t dstLen,
 261                     const wchar_t *src, size_t srcLen) const
 262 {
 263     // the number of chars [which would be] written to dst [if it were not NULL]
 264     size_t dstWritten = 0;
 265
 266     // make a copy of the input string unless it is already properly
 267     // NUL-terminated
 268     //
 269     // if we don't know its length we have no choice but to assume that it is,
 270     // indeed, properly terminated
 271     wxWCharBuffer bufTmp;
 272     if ( srcLen == (size_t)-1 )
 273     {
 274         srcLen = wxWcslen(src) + 1;
 275     }
 276     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 277     {
 278         // make a copy in order to properly NUL-terminate the string
 279         bufTmp = wxWCharBuffer(srcLen);
 280         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 281         src = bufTmp;
 282     }
 283
 284     const size_t lenNul = GetMBNulLen();
 285     for ( const wchar_t * const srcEnd = src + srcLen;
 286           src < srcEnd;
 287           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 288     {
 289         // try to convert the current chunk
 290         size_t lenChunk = WC2MB(NULL, src, 0);
 291
 292         if ( lenChunk == wxCONV_FAILED )
 293             return wxCONV_FAILED;
 294
 295         lenChunk += lenNul;
 296         dstWritten += lenChunk;
 297
 298         if ( dst )
 299         {
 300             if ( dstWritten > dstLen )
 301                 return wxCONV_FAILED;
 302
 303             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 304                 return wxCONV_FAILED;
 305
 306             dst += lenChunk;
 307         }
 308     }
 309
 310     return dstWritten;
 311 }
 312
 313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 314 {
 315     size_t rc = ToWChar(out, outLen, in);
 316     if ( rc != wxCONV_FAILED )
 317     {
 318         // ToWChar() returns the buffer length, i.e. including the trailing
 319         // NUL, while this method doesn't take it into account
 320         rc--;
 321     }
 322
 323     return rc;
 324 }
 325
 326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 327 {
 328     size_t rc = FromWChar(out, outLen, in);
 329     if ( rc != wxCONV_FAILED )
 330     {
 331         rc -= GetMBNulLen();
 332     }
 333
 334     return rc;
 335 }
 336
 337 wxMBConv::~wxMBConv()
 338 {
 339     // nothing to do here (necessary for Darwin linking probably)
 340 }
 341
 342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 343 {
 344     if ( psz )
 345     {
 346         // calculate the length of the buffer needed first
 347         const size_t nLen = MB2WC(NULL, psz, 0);
 348         if ( nLen != wxCONV_FAILED )
 349         {
 350             // now do the actual conversion
 351             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 352
 353             // +1 for the trailing NULL
 354             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 355                 return buf;
 356         }
 357     }
 358
 359     return wxWCharBuffer();
 360 }
 361
 362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 363 {
 364     if ( pwz )
 365     {
 366         const size_t nLen = WC2MB(NULL, pwz, 0);
 367         if ( nLen != wxCONV_FAILED )
 368         {
 369             // extra space for trailing NUL(s)
 370             static const size_t extraLen = GetMaxMBNulLen();
 371
 372             wxCharBuffer buf(nLen + extraLen - 1);
 373             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxCharBuffer();
 379 }
 380
 381 const wxWCharBuffer
 382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 383 {
 384     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 385     if ( dstLen != wxCONV_FAILED )
 386     {
 387         wxWCharBuffer wbuf(dstLen - 1);
 388         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 389         {
 390             if ( outLen )
 391                 *outLen = dstLen - 1;
 392             return wbuf;
 393         }
 394     }
 395
 396     if ( outLen )
 397         *outLen = 0;
 398
 399     return wxWCharBuffer();
 400 }
 401
 402 const wxCharBuffer
 403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxCharBuffer buf(dstLen - 1);
 409         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 410         {
 411             if ( outLen )
 412                 *outLen = dstLen - 1;
 413             return buf;
 414         }
 415     }
 416
 417     if ( outLen )
 418         *outLen = 0;
 419
 420     return wxCharBuffer();
 421 }
 422
 423 // ----------------------------------------------------------------------------
 424 // wxMBConvLibc
 425 // ----------------------------------------------------------------------------
 426
 427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 428 {
 429     return wxMB2WC(buf, psz, n);
 430 }
 431
 432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 433 {
 434     return wxWC2MB(buf, psz, n);
 435 }
 436
 437 // ----------------------------------------------------------------------------
 438 // wxConvBrokenFileNames
 439 // ----------------------------------------------------------------------------
 440
 441 #ifdef __UNIX__
 442
 443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 444 {
 445     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 446                   || wxStricmp(charset, _T("UTF8")) == 0  )
 447         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 448     else
 449         m_conv = new wxCSConv(charset);
 450 }
 451
 452 #endif // __UNIX__
 453
 454 // ----------------------------------------------------------------------------
 455 // UTF-7
 456 // ----------------------------------------------------------------------------
 457
 458 // Implementation (C) 2004 Fredrik Roubert
 459
 460 //
 461 // BASE64 decoding table
 462 //
 463 static const unsigned char utf7unb64[] =
 464 {
 465     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 471     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 472     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 473     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 474     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 475     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 476     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 477     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 478     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 479     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 480     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 481     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 482     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 483     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 485     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 486     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 488     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 489     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 497 };
 498
 499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 500 {
 501     size_t len = 0;
 502
 503     while ( *psz && (!buf || (len < n)) )
 504     {
 505         unsigned char cc = *psz++;
 506         if (cc != '+')
 507         {
 508             // plain ASCII char
 509             if (buf)
 510                 *buf++ = cc;
 511             len++;
 512         }
 513         else if (*psz == '-')
 514         {
 515             // encoded plus sign
 516             if (buf)
 517                 *buf++ = cc;
 518             len++;
 519             psz++;
 520         }
 521         else // start of BASE64 encoded string
 522         {
 523             bool lsb, ok;
 524             unsigned int d, l;
 525             for ( ok = lsb = false, d = 0, l = 0;
 526                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 527                   psz++ )
 528             {
 529                 d <<= 6;
 530                 d += cc;
 531                 for (l += 6; l >= 8; lsb = !lsb)
 532                 {
 533                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 534                     if (lsb)
 535                     {
 536                         if (buf)
 537                             *buf++ |= c;
 538                         len ++;
 539                     }
 540                     else
 541                     {
 542                         if (buf)
 543                             *buf = (wchar_t)(c << 8);
 544                     }
 545
 546                     ok = true;
 547                 }
 548             }
 549
 550             if ( !ok )
 551             {
 552                 // in valid UTF7 we should have valid characters after '+'
 553                 return (size_t)-1;
 554             }
 555
 556             if (*psz == '-')
 557                 psz++;
 558         }
 559     }
 560
 561     if ( buf && (len < n) )
 562         *buf = '\0';
 563
 564     return len;
 565 }
 566
 567 //
 568 // BASE64 encoding table
 569 //
 570 static const unsigned char utf7enb64[] =
 571 {
 572     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 573     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 574     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 575     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 576     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 577     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 578     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 579     '4', '5', '6', '7', '8', '9', '+', '/'
 580 };
 581
 582 //
 583 // UTF-7 encoding table
 584 //
 585 // 0 - Set D (directly encoded characters)
 586 // 1 - Set O (optional direct characters)
 587 // 2 - whitespace characters (optional)
 588 // 3 - special characters
 589 //
 590 static const unsigned char utf7encode[128] =
 591 {
 592     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 593     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 594     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 595     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 596     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 597     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 598     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 599     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 600 };
 601
 602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 603 {
 604     size_t len = 0;
 605
 606     while (*psz && ((!buf) || (len < n)))
 607     {
 608         wchar_t cc = *psz++;
 609         if (cc < 0x80 && utf7encode[cc] < 1)
 610         {
 611             // plain ASCII char
 612             if (buf)
 613                 *buf++ = (char)cc;
 614             len++;
 615         }
 616 #ifndef WC_UTF16
 617         else if (((wxUint32)cc) > 0xffff)
 618         {
 619             // no surrogate pair generation (yet?)
 620             return (size_t)-1;
 621         }
 622 #endif
 623         else
 624         {
 625             if (buf)
 626                 *buf++ = '+';
 627             len++;
 628             if (cc != '+')
 629             {
 630                 // BASE64 encode string
 631                 unsigned int lsb, d, l;
 632                 for (d = 0, l = 0; /*nothing*/; psz++)
 633                 {
 634                     for (lsb = 0; lsb < 2; lsb ++)
 635                     {
 636                         d <<= 8;
 637                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 638
 639                         for (l += 8; l >= 6; )
 640                         {
 641                             l -= 6;
 642                             if (buf)
 643                                 *buf++ = utf7enb64[(d >> l) % 64];
 644                             len++;
 645                         }
 646                     }
 647                     cc = *psz;
 648                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 649                         break;
 650                 }
 651                 if (l != 0)
 652                 {
 653                     if (buf)
 654                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 655                     len++;
 656                 }
 657             }
 658             if (buf)
 659                 *buf++ = '-';
 660             len++;
 661         }
 662     }
 663     if (buf && (len < n))
 664         *buf = 0;
 665     return len;
 666 }
 667
 668 // ----------------------------------------------------------------------------
 669 // UTF-8
 670 // ----------------------------------------------------------------------------
 671
 672 static wxUint32 utf8_max[]=
 673     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 674
 675 // boundaries of the private use area we use to (temporarily) remap invalid
 676 // characters invalid in a UTF-8 encoded string
 677 const wxUint32 wxUnicodePUA = 0x100000;
 678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 679
 680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 681 {
 682     size_t len = 0;
 683
 684     while (*psz && ((!buf) || (len < n)))
 685     {
 686         const char *opsz = psz;
 687         bool invalid = false;
 688         unsigned char cc = *psz++, fc = cc;
 689         unsigned cnt;
 690         for (cnt = 0; fc & 0x80; cnt++)
 691             fc <<= 1;
 692         if (!cnt)
 693         {
 694             // plain ASCII char
 695             if (buf)
 696                 *buf++ = cc;
 697             len++;
 698
 699             // escape the escape character for octal escapes
 700             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 701                     && cc == '\\' && (!buf || len < n))
 702             {
 703                 if (buf)
 704                     *buf++ = cc;
 705                 len++;
 706             }
 707         }
 708         else
 709         {
 710             cnt--;
 711             if (!cnt)
 712             {
 713                 // invalid UTF-8 sequence
 714                 invalid = true;
 715             }
 716             else
 717             {
 718                 unsigned ocnt = cnt - 1;
 719                 wxUint32 res = cc & (0x3f >> cnt);
 720                 while (cnt--)
 721                 {
 722                     cc = *psz;
 723                     if ((cc & 0xC0) != 0x80)
 724                     {
 725                         // invalid UTF-8 sequence
 726                         invalid = true;
 727                         break;
 728                     }
 729                     psz++;
 730                     res = (res << 6) | (cc & 0x3f);
 731                 }
 732                 if (invalid || res <= utf8_max[ocnt])
 733                 {
 734                     // illegal UTF-8 encoding
 735                     invalid = true;
 736                 }
 737                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 738                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 739                 {
 740                     // if one of our PUA characters turns up externally
 741                     // it must also be treated as an illegal sequence
 742                     // (a bit like you have to escape an escape character)
 743                     invalid = true;
 744                 }
 745                 else
 746                 {
 747 #ifdef WC_UTF16
 748                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 749                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 750                     if (pa == (size_t)-1)
 751                     {
 752                         invalid = true;
 753                     }
 754                     else
 755                     {
 756                         if (buf)
 757                             buf += pa;
 758                         len += pa;
 759                     }
 760 #else // !WC_UTF16
 761                     if (buf)
 762                         *buf++ = (wchar_t)res;
 763                     len++;
 764 #endif // WC_UTF16/!WC_UTF16
 765                 }
 766             }
 767             if (invalid)
 768             {
 769                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 770                 {
 771                     while (opsz < psz && (!buf || len < n))
 772                     {
 773 #ifdef WC_UTF16
 774                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 775                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 776                         wxASSERT(pa != (size_t)-1);
 777                         if (buf)
 778                             buf += pa;
 779                         opsz++;
 780                         len += pa;
 781 #else
 782                         if (buf)
 783                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 784                         opsz++;
 785                         len++;
 786 #endif
 787                     }
 788                 }
 789                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 790                 {
 791                     while (opsz < psz && (!buf || len < n))
 792                     {
 793                         if ( buf && len + 3 < n )
 794                         {
 795                             unsigned char on = *opsz;
 796                             *buf++ = L'\\';
 797                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 798                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 799                             *buf++ = (wchar_t)( L'0' + on % 010 );
 800                         }
 801                         opsz++;
 802                         len += 4;
 803                     }
 804                 }
 805                 else // MAP_INVALID_UTF8_NOT
 806                 {
 807                     return (size_t)-1;
 808                 }
 809             }
 810         }
 811     }
 812     if (buf && (len < n))
 813         *buf = 0;
 814     return len;
 815 }
 816
 817 static inline bool isoctal(wchar_t wch)
 818 {
 819     return L'0' <= wch && wch <= L'7';
 820 }
 821
 822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 823 {
 824     size_t len = 0;
 825
 826     while (*psz && ((!buf) || (len < n)))
 827     {
 828         wxUint32 cc;
 829 #ifdef WC_UTF16
 830         // cast is ok for WC_UTF16
 831         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 832         psz += (pa == (size_t)-1) ? 1 : pa;
 833 #else
 834         cc=(*psz++) & 0x7fffffff;
 835 #endif
 836
 837         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 838                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 839         {
 840             if (buf)
 841                 *buf++ = (char)(cc - wxUnicodePUA);
 842             len++;
 843         }
 844         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 845                     && cc == L'\\' && psz[0] == L'\\' )
 846         {
 847             if (buf)
 848                 *buf++ = (char)cc;
 849             psz++;
 850             len++;
 851         }
 852         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 853                     cc == L'\\' &&
 854                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 855         {
 856             if (buf)
 857             {
 858                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 859                                  (psz[1] - L'0')*010 +
 860                                  (psz[2] - L'0'));
 861             }
 862
 863             psz += 3;
 864             len++;
 865         }
 866         else
 867         {
 868             unsigned cnt;
 869             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 870             if (!cnt)
 871             {
 872                 // plain ASCII char
 873                 if (buf)
 874                     *buf++ = (char) cc;
 875                 len++;
 876             }
 877
 878             else
 879             {
 880                 len += cnt + 1;
 881                 if (buf)
 882                 {
 883                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 884                     while (cnt--)
 885                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 886                 }
 887             }
 888         }
 889     }
 890
 891     if (buf && (len<n))
 892         *buf = 0;
 893
 894     return len;
 895 }
 896
 897 // ----------------------------------------------------------------------------
 898 // UTF-16
 899 // ----------------------------------------------------------------------------
 900
 901 #ifdef WORDS_BIGENDIAN
 902     #define wxMBConvUTF16straight wxMBConvUTF16BE
 903     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 904 #else
 905     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 906     #define wxMBConvUTF16straight wxMBConvUTF16LE
 907 #endif
 908
 909
 910 #ifdef WC_UTF16
 911
 912 // copy 16bit MB to 16bit String
 913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 914 {
 915     size_t len=0;
 916
 917     while (*(wxUint16*)psz && (!buf || len < n))
 918     {
 919         if (buf)
 920             *buf++ = *(wxUint16*)psz;
 921         len++;
 922
 923         psz += sizeof(wxUint16);
 924     }
 925     if (buf && len<n)   *buf=0;
 926
 927     return len;
 928 }
 929
 930
 931 // copy 16bit String to 16bit MB
 932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 933 {
 934     size_t len=0;
 935
 936     while (*psz && (!buf || len < n))
 937     {
 938         if (buf)
 939         {
 940             *(wxUint16*)buf = *psz;
 941             buf += sizeof(wxUint16);
 942         }
 943         len += sizeof(wxUint16);
 944         psz++;
 945     }
 946     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 947
 948     return len;
 949 }
 950
 951
 952 // swap 16bit MB to 16bit String
 953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 954 {
 955     size_t len = 0;
 956
 957     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 958     // inside the string
 959     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 960     {
 961         if ( buf )
 962         {
 963             ((char *)buf)[0] = psz[1];
 964             ((char *)buf)[1] = psz[0];
 965             buf++;
 966         }
 967         len++;
 968         psz += 2;
 969     }
 970
 971     if ( buf && len < n )
 972         *buf = L'\0';
 973
 974     return len;
 975 }
 976
 977
 978 // swap 16bit MB to 16bit String
 979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 980 {
 981     size_t len = 0;
 982
 983     while ( *psz && (!buf || len < n) )
 984     {
 985         if ( buf )
 986         {
 987             *buf++ = ((char*)psz)[1];
 988             *buf++ = ((char*)psz)[0];
 989         }
 990         len += 2;
 991         psz++;
 992     }
 993
 994     if ( buf && len < n - 1 )
 995     {
 996         buf[0] =
 997         buf[1] = '\0';
 998     }
 999
1000     return len;
1001 }
1002
1003
1004 #else // WC_UTF16
1005
1006
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010     size_t len=0;
1011
1012     while (*(wxUint16*)psz && (!buf || len < n))
1013     {
1014         wxUint32 cc;
1015         size_t pa=decode_utf16((wxUint16*)psz, cc);
1016         if (pa == (size_t)-1)
1017             return pa;
1018
1019         if (buf)
1020             *buf++ = (wchar_t)cc;
1021         len++;
1022         psz += pa * sizeof(wxUint16);
1023     }
1024     if (buf && len<n)   *buf=0;
1025
1026     return len;
1027 }
1028
1029
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032 {
1033     size_t len=0;
1034
1035     while (*psz && (!buf || len < n))
1036     {
1037         wxUint16 cc[2];
1038         size_t pa=encode_utf16(*psz, cc);
1039
1040         if (pa == (size_t)-1)
1041             return pa;
1042
1043         if (buf)
1044         {
1045             *(wxUint16*)buf = cc[0];
1046             buf += sizeof(wxUint16);
1047             if (pa > 1)
1048             {
1049                 *(wxUint16*)buf = cc[1];
1050                 buf += sizeof(wxUint16);
1051             }
1052         }
1053
1054         len += pa*sizeof(wxUint16);
1055         psz++;
1056     }
1057     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1058
1059     return len;
1060 }
1061
1062
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1065 {
1066     size_t len=0;
1067
1068     while (*(wxUint16*)psz && (!buf || len < n))
1069     {
1070         wxUint32 cc;
1071         char tmp[4];
1072         tmp[0]=psz[1];  tmp[1]=psz[0];
1073         tmp[2]=psz[3];  tmp[3]=psz[2];
1074
1075         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1076         if (pa == (size_t)-1)
1077             return pa;
1078
1079         if (buf)
1080             *buf++ = (wchar_t)cc;
1081
1082         len++;
1083         psz += pa * sizeof(wxUint16);
1084     }
1085     if (buf && len<n)   *buf=0;
1086
1087     return len;
1088 }
1089
1090
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1093 {
1094     size_t len=0;
1095
1096     while (*psz && (!buf || len < n))
1097     {
1098         wxUint16 cc[2];
1099         size_t pa=encode_utf16(*psz, cc);
1100
1101         if (pa == (size_t)-1)
1102             return pa;
1103
1104         if (buf)
1105         {
1106             *buf++ = ((char*)cc)[1];
1107             *buf++ = ((char*)cc)[0];
1108             if (pa > 1)
1109             {
1110                 *buf++ = ((char*)cc)[3];
1111                 *buf++ = ((char*)cc)[2];
1112             }
1113         }
1114
1115         len += pa*sizeof(wxUint16);
1116         psz++;
1117     }
1118     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1119
1120     return len;
1121 }
1122
1123 #endif // WC_UTF16
1124
1125
1126 // ----------------------------------------------------------------------------
1127 // UTF-32
1128 // ----------------------------------------------------------------------------
1129
1130 #ifdef WORDS_BIGENDIAN
1131 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1132 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1133 #else
1134 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1135 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1136 #endif
1137
1138
1139 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1140 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1141
1142
1143 #ifdef WC_UTF16
1144
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1147 {
1148     size_t len=0;
1149
1150     while (*(wxUint32*)psz && (!buf || len < n))
1151     {
1152         wxUint16 cc[2];
1153
1154         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1155         if (pa == (size_t)-1)
1156             return pa;
1157
1158         if (buf)
1159         {
1160             *buf++ = cc[0];
1161             if (pa > 1)
1162                 *buf++ = cc[1];
1163         }
1164         len += pa;
1165         psz += sizeof(wxUint32);
1166     }
1167     if (buf && len<n)   *buf=0;
1168
1169     return len;
1170 }
1171
1172
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1175 {
1176     size_t len=0;
1177
1178     while (*psz && (!buf || len < n))
1179     {
1180         wxUint32 cc;
1181
1182         // cast is ok for WC_UTF16
1183         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1184         if (pa == (size_t)-1)
1185             return pa;
1186
1187         if (buf)
1188         {
1189             *(wxUint32*)buf = cc;
1190             buf += sizeof(wxUint32);
1191         }
1192         len += sizeof(wxUint32);
1193         psz += pa;
1194     }
1195
1196     if (buf && len<=n-sizeof(wxUint32))
1197         *(wxUint32*)buf=0;
1198
1199     return len;
1200 }
1201
1202
1203
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1206 {
1207     size_t len=0;
1208
1209     while (*(wxUint32*)psz && (!buf || len < n))
1210     {
1211         char tmp[4];
1212         tmp[0] = psz[3];   tmp[1] = psz[2];
1213         tmp[2] = psz[1];   tmp[3] = psz[0];
1214
1215
1216         wxUint16 cc[2];
1217
1218         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1219         if (pa == (size_t)-1)
1220             return pa;
1221
1222         if (buf)
1223         {
1224             *buf++ = cc[0];
1225             if (pa > 1)
1226                 *buf++ = cc[1];
1227         }
1228         len += pa;
1229         psz += sizeof(wxUint32);
1230     }
1231
1232     if (buf && len<n)
1233         *buf=0;
1234
1235     return len;
1236 }
1237
1238
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1241 {
1242     size_t len=0;
1243
1244     while (*psz && (!buf || len < n))
1245     {
1246         char cc[4];
1247
1248         // cast is ok for WC_UTF16
1249         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1250         if (pa == (size_t)-1)
1251             return pa;
1252
1253         if (buf)
1254         {
1255             *buf++ = cc[3];
1256             *buf++ = cc[2];
1257             *buf++ = cc[1];
1258             *buf++ = cc[0];
1259         }
1260         len += sizeof(wxUint32);
1261         psz += pa;
1262     }
1263
1264     if (buf && len<=n-sizeof(wxUint32))
1265         *(wxUint32*)buf=0;
1266
1267     return len;
1268 }
1269
1270 #else // WC_UTF16
1271
1272
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1275 {
1276     size_t len=0;
1277
1278     while (*(wxUint32*)psz && (!buf || len < n))
1279     {
1280         if (buf)
1281             *buf++ = (wchar_t)(*(wxUint32*)psz);
1282         len++;
1283         psz += sizeof(wxUint32);
1284     }
1285
1286     if (buf && len<n)
1287         *buf=0;
1288
1289     return len;
1290 }
1291
1292
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1295 {
1296     size_t len=0;
1297
1298     while (*psz && (!buf || len < n))
1299     {
1300         if (buf)
1301         {
1302             *(wxUint32*)buf = *psz;
1303             buf += sizeof(wxUint32);
1304         }
1305
1306         len += sizeof(wxUint32);
1307         psz++;
1308     }
1309
1310     if (buf && len<=n-sizeof(wxUint32))
1311         *(wxUint32*)buf=0;
1312
1313     return len;
1314 }
1315
1316
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1319 {
1320     size_t len=0;
1321
1322     while (*(wxUint32*)psz && (!buf || len < n))
1323     {
1324         if (buf)
1325         {
1326             ((char *)buf)[0] = psz[3];
1327             ((char *)buf)[1] = psz[2];
1328             ((char *)buf)[2] = psz[1];
1329             ((char *)buf)[3] = psz[0];
1330             buf++;
1331         }
1332         len++;
1333         psz += sizeof(wxUint32);
1334     }
1335
1336     if (buf && len<n)
1337         *buf=0;
1338
1339     return len;
1340 }
1341
1342
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1345 {
1346     size_t len=0;
1347
1348     while (*psz && (!buf || len < n))
1349     {
1350         if (buf)
1351         {
1352             *buf++ = ((char *)psz)[3];
1353             *buf++ = ((char *)psz)[2];
1354             *buf++ = ((char *)psz)[1];
1355             *buf++ = ((char *)psz)[0];
1356         }
1357         len += sizeof(wxUint32);
1358         psz++;
1359     }
1360
1361     if (buf && len<=n-sizeof(wxUint32))
1362         *(wxUint32*)buf=0;
1363
1364     return len;
1365 }
1366
1367
1368 #endif // WC_UTF16
1369
1370
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1374
1375 #ifdef HAVE_ICONV
1376
1377 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1378 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1379 //     (unless there's yet another bug in glibc) the only case when iconv()
1380 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1381 //     left in the input buffer -- when _real_ error occurs,
1382 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1383 //     iconv() failure.
1384 //     [This bug does not appear in glibc 2.2.]
1385 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1386 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1387                                      (errno != E2BIG || bufLeft != 0))
1388 #else
1389 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1390 #endif
1391
1392 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1393
1394 #define ICONV_T_INVALID ((iconv_t)-1)
1395
1396 #if SIZEOF_WCHAR_T == 4
1397     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1398     #define WC_ENC      wxFONTENCODING_UTF32
1399 #elif SIZEOF_WCHAR_T == 2
1400     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1401     #define WC_ENC      wxFONTENCODING_UTF16
1402 #else // sizeof(wchar_t) != 2 nor 4
1403     // does this ever happen?
1404     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1405 #endif
1406
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1410
1411 class wxMBConv_iconv : public wxMBConv
1412 {
1413 public:
1414     wxMBConv_iconv(const wxChar *name);
1415     virtual ~wxMBConv_iconv();
1416
1417     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1418     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1419
1420     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1421     virtual size_t GetMBNulLen() const;
1422
1423     virtual wxMBConv *Clone() const
1424     {
1425         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1426         p->m_minMBCharWidth = m_minMBCharWidth;
1427         return p;
1428     }
1429
1430     bool IsOk() const
1431         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1432
1433 protected:
1434     // the iconv handlers used to translate from multibyte to wide char and in
1435     // the other direction
1436     iconv_t m2w,
1437             w2m;
1438 #if wxUSE_THREADS
1439     // guards access to m2w and w2m objects
1440     wxMutex m_iconvMutex;
1441 #endif
1442
1443 private:
1444     // the name (for iconv_open()) of a wide char charset -- if none is
1445     // available on this machine, it will remain NULL
1446     static wxString ms_wcCharsetName;
1447
1448     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1449     // different endian-ness than the native one
1450     static bool ms_wcNeedsSwap;
1451
1452
1453     // name of the encoding handled by this conversion
1454     wxString m_name;
1455
1456     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1457     // initially
1458     size_t m_minMBCharWidth;
1459 };
1460
1461 // make the constructor available for unit testing
1462 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1463 {
1464     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1465     if ( !result->IsOk() )
1466     {
1467         delete result;
1468         return 0;
1469     }
1470     return result;
1471 }
1472
1473 wxString wxMBConv_iconv::ms_wcCharsetName;
1474 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1475
1476 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1477               : m_name(name)
1478 {
1479     m_minMBCharWidth = 0;
1480
1481     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1482     // names for the charsets
1483     const wxCharBuffer cname(wxString(name).ToAscii());
1484
1485     // check for charset that represents wchar_t:
1486     if ( ms_wcCharsetName.empty() )
1487     {
1488         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1489
1490 #if wxUSE_FONTMAP
1491         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1492 #else // !wxUSE_FONTMAP
1493         static const wxChar *names[] =
1494         {
1495 #if SIZEOF_WCHAR_T == 4
1496             _T("UCS-4"),
1497 #elif SIZEOF_WCHAR_T = 2
1498             _T("UCS-2"),
1499 #endif
1500             NULL
1501         };
1502 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1503
1504         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1505         {
1506             const wxString nameCS(*names);
1507
1508             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1509             wxString nameXE(nameCS);
1510             #ifdef WORDS_BIGENDIAN
1511                 nameXE += _T("BE");
1512             #else // little endian
1513                 nameXE += _T("LE");
1514             #endif
1515
1516             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1517                        nameXE.c_str());
1518
1519             m2w = iconv_open(nameXE.ToAscii(), cname);
1520             if ( m2w == ICONV_T_INVALID )
1521             {
1522                 // try charset w/o bytesex info (e.g. "UCS4")
1523                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1524                            nameCS.c_str());
1525                 m2w = iconv_open(nameCS.ToAscii(), cname);
1526
1527                 // and check for bytesex ourselves:
1528                 if ( m2w != ICONV_T_INVALID )
1529                 {
1530                     char    buf[2], *bufPtr;
1531                     wchar_t wbuf[2], *wbufPtr;
1532                     size_t  insz, outsz;
1533                     size_t  res;
1534
1535                     buf[0] = 'A';
1536                     buf[1] = 0;
1537                     wbuf[0] = 0;
1538                     insz = 2;
1539                     outsz = SIZEOF_WCHAR_T * 2;
1540                     wbufPtr = wbuf;
1541                     bufPtr = buf;
1542
1543                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1544                                 (char**)&wbufPtr, &outsz);
1545
1546                     if (ICONV_FAILED(res, insz))
1547                     {
1548                         wxLogLastError(wxT("iconv"));
1549                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1550                                    nameCS.c_str());
1551                     }
1552                     else // ok, can convert to this encoding, remember it
1553                     {
1554                         ms_wcCharsetName = nameCS;
1555                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1556                     }
1557                 }
1558             }
1559             else // use charset not requiring byte swapping
1560             {
1561                 ms_wcCharsetName = nameXE;
1562             }
1563         }
1564
1565         wxLogTrace(TRACE_STRCONV,
1566                    wxT("iconv wchar_t charset is \"%s\"%s"),
1567                    ms_wcCharsetName.empty() ? _T("<none>")
1568                                             : ms_wcCharsetName.c_str(),
1569                    ms_wcNeedsSwap ? _T(" (needs swap)")
1570                                   : _T(""));
1571     }
1572     else // we already have ms_wcCharsetName
1573     {
1574         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1575     }
1576
1577     if ( ms_wcCharsetName.empty() )
1578     {
1579         w2m = ICONV_T_INVALID;
1580     }
1581     else
1582     {
1583         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1584         if ( w2m == ICONV_T_INVALID )
1585         {
1586             wxLogTrace(TRACE_STRCONV,
1587                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1588                        ms_wcCharsetName.c_str(), cname.data());
1589         }
1590     }
1591 }
1592
1593 wxMBConv_iconv::~wxMBConv_iconv()
1594 {
1595     if ( m2w != ICONV_T_INVALID )
1596         iconv_close(m2w);
1597     if ( w2m != ICONV_T_INVALID )
1598         iconv_close(w2m);
1599 }
1600
1601 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1602 {
1603     // find the string length: notice that must be done differently for
1604     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1605     size_t inbuf;
1606     const size_t nulLen = GetMBNulLen();
1607     switch ( nulLen )
1608     {
1609         default:
1610             return (size_t)-1;
1611
1612         case 1:
1613             inbuf = strlen(psz); // arguably more optimized than our version
1614             break;
1615
1616         case 2:
1617         case 4:
1618             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1619             // they also have to start at character boundary and not span two
1620             // adjacent characters
1621             const char *p;
1622             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1623                 ;
1624             inbuf = p - psz;
1625             break;
1626     }
1627
1628 #if wxUSE_THREADS
1629     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1630     //     Unfortunately there is a couple of global wxCSConv objects such as
1631     //     wxConvLocal that are used all over wx code, so we have to make sure
1632     //     the handle is used by at most one thread at the time. Otherwise
1633     //     only a few wx classes would be safe to use from non-main threads
1634     //     as MB<->WC conversion would fail "randomly".
1635     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1636 #endif // wxUSE_THREADS
1637
1638
1639     size_t outbuf = n * SIZEOF_WCHAR_T;
1640     size_t res, cres;
1641     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1642     wchar_t *bufPtr = buf;
1643     const char *pszPtr = psz;
1644
1645     if (buf)
1646     {
1647         // have destination buffer, convert there
1648         cres = iconv(m2w,
1649                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1650                      (char**)&bufPtr, &outbuf);
1651         res = n - (outbuf / SIZEOF_WCHAR_T);
1652
1653         if (ms_wcNeedsSwap)
1654         {
1655             // convert to native endianness
1656             for ( unsigned i = 0; i < res; i++ )
1657                 buf[n] = WC_BSWAP(buf[i]);
1658         }
1659
1660         // NUL-terminate the string if there is any space left
1661         if (res < n)
1662             buf[res] = 0;
1663     }
1664     else
1665     {
1666         // no destination buffer... convert using temp buffer
1667         // to calculate destination buffer requirement
1668         wchar_t tbuf[8];
1669         res = 0;
1670         do {
1671             bufPtr = tbuf;
1672             outbuf = 8*SIZEOF_WCHAR_T;
1673
1674             cres = iconv(m2w,
1675                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1676                          (char**)&bufPtr, &outbuf );
1677
1678             res += 8-(outbuf/SIZEOF_WCHAR_T);
1679         } while ((cres==(size_t)-1) && (errno==E2BIG));
1680     }
1681
1682     if (ICONV_FAILED(cres, inbuf))
1683     {
1684         //VS: it is ok if iconv fails, hence trace only
1685         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1686         return (size_t)-1;
1687     }
1688
1689     return res;
1690 }
1691
1692 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1693 {
1694 #if wxUSE_THREADS
1695     // NB: explained in MB2WC
1696     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1697 #endif
1698
1699     size_t inlen = wxWcslen(psz);
1700     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1701     size_t outbuf = n;
1702     size_t res, cres;
1703
1704     wchar_t *tmpbuf = 0;
1705
1706     if (ms_wcNeedsSwap)
1707     {
1708         // need to copy to temp buffer to switch endianness
1709         // (doing WC_BSWAP twice on the original buffer won't help, as it
1710         //  could be in read-only memory, or be accessed in some other thread)
1711         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1712         for ( size_t i = 0; i < inlen; i++ )
1713             tmpbuf[n] = WC_BSWAP(psz[i]);
1714         tmpbuf[inlen] = L'\0';
1715         psz = tmpbuf;
1716     }
1717
1718     if (buf)
1719     {
1720         // have destination buffer, convert there
1721         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1722
1723         res = n-outbuf;
1724
1725         // NB: iconv was given only wcslen(psz) characters on input, and so
1726         //     it couldn't convert the trailing zero. Let's do it ourselves
1727         //     if there's some room left for it in the output buffer.
1728         if (res < n)
1729             buf[0] = 0;
1730     }
1731     else
1732     {
1733         // no destination buffer... convert using temp buffer
1734         // to calculate destination buffer requirement
1735         char tbuf[16];
1736         res = 0;
1737         do {
1738             buf = tbuf; outbuf = 16;
1739
1740             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1741
1742             res += 16 - outbuf;
1743         } while ((cres==(size_t)-1) && (errno==E2BIG));
1744     }
1745
1746     if (ms_wcNeedsSwap)
1747     {
1748         free(tmpbuf);
1749     }
1750
1751     if (ICONV_FAILED(cres, inbuf))
1752     {
1753         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1754         return (size_t)-1;
1755     }
1756
1757     return res;
1758 }
1759
1760 size_t wxMBConv_iconv::GetMBNulLen() const
1761 {
1762     if ( m_minMBCharWidth == 0 )
1763     {
1764         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1765
1766 #if wxUSE_THREADS
1767         // NB: explained in MB2WC
1768         wxMutexLocker lock(self->m_iconvMutex);
1769 #endif
1770
1771         wchar_t *wnul = L"";
1772         char buf[8]; // should be enough for NUL in any encoding
1773         size_t inLen = sizeof(wchar_t),
1774                outLen = WXSIZEOF(buf);
1775         char *in = (char *)wnul;
1776         char *out = buf;
1777         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1778         {
1779             self->m_minMBCharWidth = (size_t)-1;
1780         }
1781         else // ok
1782         {
1783             self->m_minMBCharWidth = out - buf;
1784         }
1785     }
1786
1787     return m_minMBCharWidth;
1788 }
1789
1790 #endif // HAVE_ICONV
1791
1792
1793 // ============================================================================
1794 // Win32 conversion classes
1795 // ============================================================================
1796
1797 #ifdef wxHAVE_WIN32_MB2WC
1798
1799 // from utils.cpp
1800 #if wxUSE_FONTMAP
1801 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1802 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1803 #endif
1804
1805 class wxMBConv_win32 : public wxMBConv
1806 {
1807 public:
1808     wxMBConv_win32()
1809     {
1810         m_CodePage = CP_ACP;
1811         m_minMBCharWidth = 0;
1812     }
1813
1814     wxMBConv_win32(const wxMBConv_win32& conv)
1815     {
1816         m_CodePage = conv.m_CodePage;
1817         m_minMBCharWidth = conv.m_minMBCharWidth;
1818     }
1819
1820 #if wxUSE_FONTMAP
1821     wxMBConv_win32(const wxChar* name)
1822     {
1823         m_CodePage = wxCharsetToCodepage(name);
1824         m_minMBCharWidth = 0;
1825     }
1826
1827     wxMBConv_win32(wxFontEncoding encoding)
1828     {
1829         m_CodePage = wxEncodingToCodepage(encoding);
1830         m_minMBCharWidth = 0;
1831     }
1832 #endif // wxUSE_FONTMAP
1833
1834     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1835     {
1836         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1837         // the behaviour is not compatible with the Unix version (using iconv)
1838         // and break the library itself, e.g. wxTextInputStream::NextChar()
1839         // wouldn't work if reading an incomplete MB char didn't result in an
1840         // error
1841         //
1842         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1843         // Win XP or newer and it is not supported for UTF-[78] so we always
1844         // use our own conversions in this case. See
1845         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1846         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1847         if ( m_CodePage == CP_UTF8 )
1848         {
1849             return wxConvUTF8.MB2WC(buf, psz, n);
1850         }
1851
1852         if ( m_CodePage == CP_UTF7 )
1853         {
1854             return wxConvUTF7.MB2WC(buf, psz, n);
1855         }
1856
1857         int flags = 0;
1858         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1859                 IsAtLeastWin2kSP4() )
1860         {
1861             flags = MB_ERR_INVALID_CHARS;
1862         }
1863
1864         const size_t len = ::MultiByteToWideChar
1865                              (
1866                                 m_CodePage,     // code page
1867                                 flags,          // flags: fall on error
1868                                 psz,            // input string
1869                                 -1,             // its length (NUL-terminated)
1870                                 buf,            // output string
1871                                 buf ? n : 0     // size of output buffer
1872                              );
1873         if ( !len )
1874         {
1875             // function totally failed
1876             return (size_t)-1;
1877         }
1878
1879         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1880         // check if we succeeded, by doing a double trip:
1881         if ( !flags && buf )
1882         {
1883             const size_t mbLen = strlen(psz);
1884             wxCharBuffer mbBuf(mbLen);
1885             if ( ::WideCharToMultiByte
1886                    (
1887                       m_CodePage,
1888                       0,
1889                       buf,
1890                       -1,
1891                       mbBuf.data(),
1892                       mbLen + 1,        // size in bytes, not length
1893                       NULL,
1894                       NULL
1895                    ) == 0 ||
1896                   strcmp(mbBuf, psz) != 0 )
1897             {
1898                 // we didn't obtain the same thing we started from, hence
1899                 // the conversion was lossy and we consider that it failed
1900                 return (size_t)-1;
1901             }
1902         }
1903
1904         // note that it returns count of written chars for buf != NULL and size
1905         // of the needed buffer for buf == NULL so in either case the length of
1906         // the string (which never includes the terminating NUL) is one less
1907         return len - 1;
1908     }
1909
1910     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1911     {
1912         /*
1913             we have a problem here: by default, WideCharToMultiByte() may
1914             replace characters unrepresentable in the target code page with bad
1915             quality approximations such as turning "1/2" symbol (U+00BD) into
1916             "1" for the code pages which don't have it and we, obviously, want
1917             to avoid this at any price
1918
1919             the trouble is that this function does it _silently_, i.e. it won't
1920             even tell us whether it did or not... Win98/2000 and higher provide
1921             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1922             we have to resort to a round trip, i.e. check that converting back
1923             results in the same string -- this is, of course, expensive but
1924             otherwise we simply can't be sure to not garble the data.
1925          */
1926
1927         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1928         // it doesn't work with CJK encodings (which we test for rather roughly
1929         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1930         // supporting it
1931         BOOL usedDef wxDUMMY_INITIALIZE(false);
1932         BOOL *pUsedDef;
1933         int flags;
1934         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1935         {
1936             // it's our lucky day
1937             flags = WC_NO_BEST_FIT_CHARS;
1938             pUsedDef = &usedDef;
1939         }
1940         else // old system or unsupported encoding
1941         {
1942             flags = 0;
1943             pUsedDef = NULL;
1944         }
1945
1946         const size_t len = ::WideCharToMultiByte
1947                              (
1948                                 m_CodePage,     // code page
1949                                 flags,          // either none or no best fit
1950                                 pwz,            // input string
1951                                 -1,             // it is (wide) NUL-terminated
1952                                 buf,            // output buffer
1953                                 buf ? n : 0,    // and its size
1954                                 NULL,           // default "replacement" char
1955                                 pUsedDef        // [out] was it used?
1956                              );
1957
1958         if ( !len )
1959         {
1960             // function totally failed
1961             return (size_t)-1;
1962         }
1963
1964         // if we were really converting, check if we succeeded
1965         if ( buf )
1966         {
1967             if ( flags )
1968             {
1969                 // check if the conversion failed, i.e. if any replacements
1970                 // were done
1971                 if ( usedDef )
1972                     return (size_t)-1;
1973             }
1974             else // we must resort to double tripping...
1975             {
1976                 wxWCharBuffer wcBuf(n);
1977                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1978                         wcscmp(wcBuf, pwz) != 0 )
1979                 {
1980                     // we didn't obtain the same thing we started from, hence
1981                     // the conversion was lossy and we consider that it failed
1982                     return (size_t)-1;
1983                 }
1984             }
1985         }
1986
1987         // see the comment above for the reason of "len - 1"
1988         return len - 1;
1989     }
1990
1991     virtual size_t GetMBNulLen() const
1992     {
1993         if ( m_minMBCharWidth == 0 )
1994         {
1995             int len = ::WideCharToMultiByte
1996                         (
1997                             m_CodePage,     // code page
1998                             0,              // no flags
1999                             L"",            // input string
2000                             1,              // translate just the NUL
2001                             NULL,           // output buffer
2002                             0,              // and its size
2003                             NULL,           // no replacement char
2004                             NULL            // [out] don't care if it was used
2005                         );
2006
2007             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2008             switch ( len )
2009             {
2010                 default:
2011                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2012                     // fall through
2013
2014                 case 0:
2015                     self->m_minMBCharWidth = (size_t)-1;
2016                     break;
2017
2018                 case 1:
2019                 case 2:
2020                 case 4:
2021                     self->m_minMBCharWidth = len;
2022                     break;
2023             }
2024         }
2025
2026         return m_minMBCharWidth;
2027     }
2028
2029     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2030
2031     bool IsOk() const { return m_CodePage != -1; }
2032
2033 private:
2034     static bool CanUseNoBestFit()
2035     {
2036         static int s_isWin98Or2k = -1;
2037
2038         if ( s_isWin98Or2k == -1 )
2039         {
2040             int verMaj, verMin;
2041             switch ( wxGetOsVersion(&verMaj, &verMin) )
2042             {
2043                 case wxWIN95:
2044                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2045                     break;
2046
2047                 case wxWINDOWS_NT:
2048                     s_isWin98Or2k = verMaj >= 5;
2049                     break;
2050
2051                 default:
2052                     // unknown, be conseravtive by default
2053                     s_isWin98Or2k = 0;
2054             }
2055
2056             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2057         }
2058
2059         return s_isWin98Or2k == 1;
2060     }
2061
2062     static bool IsAtLeastWin2kSP4()
2063     {
2064 #ifdef __WXWINCE__
2065         return false;
2066 #else
2067         static int s_isAtLeastWin2kSP4 = -1;
2068
2069         if ( s_isAtLeastWin2kSP4 == -1 )
2070         {
2071             OSVERSIONINFOEX ver;
2072
2073             memset(&ver, 0, sizeof(ver));
2074             ver.dwOSVersionInfoSize = sizeof(ver);
2075             GetVersionEx((OSVERSIONINFO*)&ver);
2076
2077             s_isAtLeastWin2kSP4 =
2078               ((ver.dwMajorVersion > 5) || // Vista+
2079                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2080                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2081                ver.wServicePackMajor >= 4)) // 2000 SP4+
2082               ? 1 : 0;
2083         }
2084
2085         return s_isAtLeastWin2kSP4 == 1;
2086 #endif
2087     }
2088
2089
2090     // the code page we're working with
2091     long m_CodePage;
2092
2093     // cached result of GetMBNulLen(), set to 0 initially meaning
2094     // "unknown"
2095     size_t m_minMBCharWidth;
2096 };
2097
2098 #endif // wxHAVE_WIN32_MB2WC
2099
2100 // ============================================================================
2101 // Cocoa conversion classes
2102 // ============================================================================
2103
2104 #if defined(__WXCOCOA__)
2105
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
2109
2110 #include <CoreFoundation/CFString.h>
2111 #include <CoreFoundation/CFStringEncodingExt.h>
2112
2113 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2114 {
2115     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2116     if ( encoding == wxFONTENCODING_DEFAULT )
2117     {
2118         enc = CFStringGetSystemEncoding();
2119     }
2120     else switch( encoding)
2121     {
2122         case wxFONTENCODING_ISO8859_1 :
2123             enc = kCFStringEncodingISOLatin1 ;
2124             break ;
2125         case wxFONTENCODING_ISO8859_2 :
2126             enc = kCFStringEncodingISOLatin2;
2127             break ;
2128         case wxFONTENCODING_ISO8859_3 :
2129             enc = kCFStringEncodingISOLatin3 ;
2130             break ;
2131         case wxFONTENCODING_ISO8859_4 :
2132             enc = kCFStringEncodingISOLatin4;
2133             break ;
2134         case wxFONTENCODING_ISO8859_5 :
2135             enc = kCFStringEncodingISOLatinCyrillic;
2136             break ;
2137         case wxFONTENCODING_ISO8859_6 :
2138             enc = kCFStringEncodingISOLatinArabic;
2139             break ;
2140         case wxFONTENCODING_ISO8859_7 :
2141             enc = kCFStringEncodingISOLatinGreek;
2142             break ;
2143         case wxFONTENCODING_ISO8859_8 :
2144             enc = kCFStringEncodingISOLatinHebrew;
2145             break ;
2146         case wxFONTENCODING_ISO8859_9 :
2147             enc = kCFStringEncodingISOLatin5;
2148             break ;
2149         case wxFONTENCODING_ISO8859_10 :
2150             enc = kCFStringEncodingISOLatin6;
2151             break ;
2152         case wxFONTENCODING_ISO8859_11 :
2153             enc = kCFStringEncodingISOLatinThai;
2154             break ;
2155         case wxFONTENCODING_ISO8859_13 :
2156             enc = kCFStringEncodingISOLatin7;
2157             break ;
2158         case wxFONTENCODING_ISO8859_14 :
2159             enc = kCFStringEncodingISOLatin8;
2160             break ;
2161         case wxFONTENCODING_ISO8859_15 :
2162             enc = kCFStringEncodingISOLatin9;
2163             break ;
2164
2165         case wxFONTENCODING_KOI8 :
2166             enc = kCFStringEncodingKOI8_R;
2167             break ;
2168         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2169             enc = kCFStringEncodingDOSRussian;
2170             break ;
2171
2172 //      case wxFONTENCODING_BULGARIAN :
2173 //          enc = ;
2174 //          break ;
2175
2176         case wxFONTENCODING_CP437 :
2177             enc =kCFStringEncodingDOSLatinUS ;
2178             break ;
2179         case wxFONTENCODING_CP850 :
2180             enc = kCFStringEncodingDOSLatin1;
2181             break ;
2182         case wxFONTENCODING_CP852 :
2183             enc = kCFStringEncodingDOSLatin2;
2184             break ;
2185         case wxFONTENCODING_CP855 :
2186             enc = kCFStringEncodingDOSCyrillic;
2187             break ;
2188         case wxFONTENCODING_CP866 :
2189             enc =kCFStringEncodingDOSRussian ;
2190             break ;
2191         case wxFONTENCODING_CP874 :
2192             enc = kCFStringEncodingDOSThai;
2193             break ;
2194         case wxFONTENCODING_CP932 :
2195             enc = kCFStringEncodingDOSJapanese;
2196             break ;
2197         case wxFONTENCODING_CP936 :
2198             enc =kCFStringEncodingDOSChineseSimplif ;
2199             break ;
2200         case wxFONTENCODING_CP949 :
2201             enc = kCFStringEncodingDOSKorean;
2202             break ;
2203         case wxFONTENCODING_CP950 :
2204             enc = kCFStringEncodingDOSChineseTrad;
2205             break ;
2206         case wxFONTENCODING_CP1250 :
2207             enc = kCFStringEncodingWindowsLatin2;
2208             break ;
2209         case wxFONTENCODING_CP1251 :
2210             enc =kCFStringEncodingWindowsCyrillic ;
2211             break ;
2212         case wxFONTENCODING_CP1252 :
2213             enc =kCFStringEncodingWindowsLatin1 ;
2214             break ;
2215         case wxFONTENCODING_CP1253 :
2216             enc = kCFStringEncodingWindowsGreek;
2217             break ;
2218         case wxFONTENCODING_CP1254 :
2219             enc = kCFStringEncodingWindowsLatin5;
2220             break ;
2221         case wxFONTENCODING_CP1255 :
2222             enc =kCFStringEncodingWindowsHebrew ;
2223             break ;
2224         case wxFONTENCODING_CP1256 :
2225             enc =kCFStringEncodingWindowsArabic ;
2226             break ;
2227         case wxFONTENCODING_CP1257 :
2228             enc = kCFStringEncodingWindowsBalticRim;
2229             break ;
2230 //   This only really encodes to UTF7 (if that) evidently
2231 //        case wxFONTENCODING_UTF7 :
2232 //            enc = kCFStringEncodingNonLossyASCII ;
2233 //            break ;
2234         case wxFONTENCODING_UTF8 :
2235             enc = kCFStringEncodingUTF8 ;
2236             break ;
2237         case wxFONTENCODING_EUC_JP :
2238             enc = kCFStringEncodingEUC_JP;
2239             break ;
2240         case wxFONTENCODING_UTF16 :
2241             enc = kCFStringEncodingUnicode ;
2242             break ;
2243         case wxFONTENCODING_MACROMAN :
2244             enc = kCFStringEncodingMacRoman ;
2245             break ;
2246         case wxFONTENCODING_MACJAPANESE :
2247             enc = kCFStringEncodingMacJapanese ;
2248             break ;
2249         case wxFONTENCODING_MACCHINESETRAD :
2250             enc = kCFStringEncodingMacChineseTrad ;
2251             break ;
2252         case wxFONTENCODING_MACKOREAN :
2253             enc = kCFStringEncodingMacKorean ;
2254             break ;
2255         case wxFONTENCODING_MACARABIC :
2256             enc = kCFStringEncodingMacArabic ;
2257             break ;
2258         case wxFONTENCODING_MACHEBREW :
2259             enc = kCFStringEncodingMacHebrew ;
2260             break ;
2261         case wxFONTENCODING_MACGREEK :
2262             enc = kCFStringEncodingMacGreek ;
2263             break ;
2264         case wxFONTENCODING_MACCYRILLIC :
2265             enc = kCFStringEncodingMacCyrillic ;
2266             break ;
2267         case wxFONTENCODING_MACDEVANAGARI :
2268             enc = kCFStringEncodingMacDevanagari ;
2269             break ;
2270         case wxFONTENCODING_MACGURMUKHI :
2271             enc = kCFStringEncodingMacGurmukhi ;
2272             break ;
2273         case wxFONTENCODING_MACGUJARATI :
2274             enc = kCFStringEncodingMacGujarati ;
2275             break ;
2276         case wxFONTENCODING_MACORIYA :
2277             enc = kCFStringEncodingMacOriya ;
2278             break ;
2279         case wxFONTENCODING_MACBENGALI :
2280             enc = kCFStringEncodingMacBengali ;
2281             break ;
2282         case wxFONTENCODING_MACTAMIL :
2283             enc = kCFStringEncodingMacTamil ;
2284             break ;
2285         case wxFONTENCODING_MACTELUGU :
2286             enc = kCFStringEncodingMacTelugu ;
2287             break ;
2288         case wxFONTENCODING_MACKANNADA :
2289             enc = kCFStringEncodingMacKannada ;
2290             break ;
2291         case wxFONTENCODING_MACMALAJALAM :
2292             enc = kCFStringEncodingMacMalayalam ;
2293             break ;
2294         case wxFONTENCODING_MACSINHALESE :
2295             enc = kCFStringEncodingMacSinhalese ;
2296             break ;
2297         case wxFONTENCODING_MACBURMESE :
2298             enc = kCFStringEncodingMacBurmese ;
2299             break ;
2300         case wxFONTENCODING_MACKHMER :
2301             enc = kCFStringEncodingMacKhmer ;
2302             break ;
2303         case wxFONTENCODING_MACTHAI :
2304             enc = kCFStringEncodingMacThai ;
2305             break ;
2306         case wxFONTENCODING_MACLAOTIAN :
2307             enc = kCFStringEncodingMacLaotian ;
2308             break ;
2309         case wxFONTENCODING_MACGEORGIAN :
2310             enc = kCFStringEncodingMacGeorgian ;
2311             break ;
2312         case wxFONTENCODING_MACARMENIAN :
2313             enc = kCFStringEncodingMacArmenian ;
2314             break ;
2315         case wxFONTENCODING_MACCHINESESIMP :
2316             enc = kCFStringEncodingMacChineseSimp ;
2317             break ;
2318         case wxFONTENCODING_MACTIBETAN :
2319             enc = kCFStringEncodingMacTibetan ;
2320             break ;
2321         case wxFONTENCODING_MACMONGOLIAN :
2322             enc = kCFStringEncodingMacMongolian ;
2323             break ;
2324         case wxFONTENCODING_MACETHIOPIC :
2325             enc = kCFStringEncodingMacEthiopic ;
2326             break ;
2327         case wxFONTENCODING_MACCENTRALEUR :
2328             enc = kCFStringEncodingMacCentralEurRoman ;
2329             break ;
2330         case wxFONTENCODING_MACVIATNAMESE :
2331             enc = kCFStringEncodingMacVietnamese ;
2332             break ;
2333         case wxFONTENCODING_MACARABICEXT :
2334             enc = kCFStringEncodingMacExtArabic ;
2335             break ;
2336         case wxFONTENCODING_MACSYMBOL :
2337             enc = kCFStringEncodingMacSymbol ;
2338             break ;
2339         case wxFONTENCODING_MACDINGBATS :
2340             enc = kCFStringEncodingMacDingbats ;
2341             break ;
2342         case wxFONTENCODING_MACTURKISH :
2343             enc = kCFStringEncodingMacTurkish ;
2344             break ;
2345         case wxFONTENCODING_MACCROATIAN :
2346             enc = kCFStringEncodingMacCroatian ;
2347             break ;
2348         case wxFONTENCODING_MACICELANDIC :
2349             enc = kCFStringEncodingMacIcelandic ;
2350             break ;
2351         case wxFONTENCODING_MACROMANIAN :
2352             enc = kCFStringEncodingMacRomanian ;
2353             break ;
2354         case wxFONTENCODING_MACCELTIC :
2355             enc = kCFStringEncodingMacCeltic ;
2356             break ;
2357         case wxFONTENCODING_MACGAELIC :
2358             enc = kCFStringEncodingMacGaelic ;
2359             break ;
2360 //      case wxFONTENCODING_MACKEYBOARD :
2361 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2362 //          break ;
2363         default :
2364             // because gcc is picky
2365             break ;
2366     } ;
2367     return enc ;
2368 }
2369
2370 class wxMBConv_cocoa : public wxMBConv
2371 {
2372 public:
2373     wxMBConv_cocoa()
2374     {
2375         Init(CFStringGetSystemEncoding()) ;
2376     }
2377
2378     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2379     {
2380         m_encoding = conv.m_encoding;
2381     }
2382
2383 #if wxUSE_FONTMAP
2384     wxMBConv_cocoa(const wxChar* name)
2385     {
2386         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2387     }
2388 #endif
2389
2390     wxMBConv_cocoa(wxFontEncoding encoding)
2391     {
2392         Init( wxCFStringEncFromFontEnc(encoding) );
2393     }
2394
2395     ~wxMBConv_cocoa()
2396     {
2397     }
2398
2399     void Init( CFStringEncoding encoding)
2400     {
2401         m_encoding = encoding ;
2402     }
2403
2404     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2405     {
2406         wxASSERT(szUnConv);
2407
2408         CFStringRef theString = CFStringCreateWithBytes (
2409                                                 NULL, //the allocator
2410                                                 (const UInt8*)szUnConv,
2411                                                 strlen(szUnConv),
2412                                                 m_encoding,
2413                                                 false //no BOM/external representation
2414                                                 );
2415
2416         wxASSERT(theString);
2417
2418         size_t nOutLength = CFStringGetLength(theString);
2419
2420         if (szOut == NULL)
2421         {
2422             CFRelease(theString);
2423             return nOutLength;
2424         }
2425
2426         CFRange theRange = { 0, nOutSize };
2427
2428 #if SIZEOF_WCHAR_T == 4
2429         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2430 #endif
2431
2432         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2433
2434         CFRelease(theString);
2435
2436         szUniCharBuffer[nOutLength] = '\0' ;
2437
2438 #if SIZEOF_WCHAR_T == 4
2439         wxMBConvUTF16 converter ;
2440         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2441         delete[] szUniCharBuffer;
2442 #endif
2443
2444         return nOutLength;
2445     }
2446
2447     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2448     {
2449         wxASSERT(szUnConv);
2450
2451         size_t nRealOutSize;
2452         size_t nBufSize = wxWcslen(szUnConv);
2453         UniChar* szUniBuffer = (UniChar*) szUnConv;
2454
2455 #if SIZEOF_WCHAR_T == 4
2456         wxMBConvUTF16 converter ;
2457         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2458         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2459         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2460         nBufSize /= sizeof(UniChar);
2461 #endif
2462
2463         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2464                                 NULL, //allocator
2465                                 szUniBuffer,
2466                                 nBufSize,
2467                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2468                             );
2469
2470         wxASSERT(theString);
2471
2472         //Note that CER puts a BOM when converting to unicode
2473         //so we  check and use getchars instead in that case
2474         if (m_encoding == kCFStringEncodingUnicode)
2475         {
2476             if (szOut != NULL)
2477                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2478
2479             nRealOutSize = CFStringGetLength(theString) + 1;
2480         }
2481         else
2482         {
2483             CFStringGetBytes(
2484                 theString,
2485                 CFRangeMake(0, CFStringGetLength(theString)),
2486                 m_encoding,
2487                 0, //what to put in characters that can't be converted -
2488                     //0 tells CFString to return NULL if it meets such a character
2489                 false, //not an external representation
2490                 (UInt8*) szOut,
2491                 nOutSize,
2492                 (CFIndex*) &nRealOutSize
2493                         );
2494         }
2495
2496         CFRelease(theString);
2497
2498 #if SIZEOF_WCHAR_T == 4
2499         delete[] szUniBuffer;
2500 #endif
2501
2502         return  nRealOutSize - 1;
2503     }
2504
2505     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2506
2507     bool IsOk() const
2508
2509     bool IsOk() const
2510     {
2511         return m_encoding != kCFStringEncodingInvalidId &&
2512               CFStringIsEncodingAvailable(m_encoding);
2513     }
2514
2515 private:
2516     CFStringEncoding m_encoding ;
2517 };
2518
2519 #endif // defined(__WXCOCOA__)
2520
2521 // ============================================================================
2522 // Mac conversion classes
2523 // ============================================================================
2524
2525 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2526
2527 class wxMBConv_mac : public wxMBConv
2528 {
2529 public:
2530     wxMBConv_mac()
2531     {
2532         Init(CFStringGetSystemEncoding()) ;
2533     }
2534
2535     wxMBConv_mac(const wxMBConv_mac& conv)
2536     {
2537         Init(conv.m_char_encoding);
2538     }
2539
2540 #if wxUSE_FONTMAP
2541     wxMBConv_mac(const wxChar* name)
2542     {
2543         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2544     }
2545 #endif
2546
2547     wxMBConv_mac(wxFontEncoding encoding)
2548     {
2549         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2550     }
2551
2552     ~wxMBConv_mac()
2553     {
2554         OSStatus status = noErr ;
2555         status = TECDisposeConverter(m_MB2WC_converter);
2556         status = TECDisposeConverter(m_WC2MB_converter);
2557     }
2558
2559
2560     void Init( TextEncodingBase encoding)
2561     {
2562         OSStatus status = noErr ;
2563         m_char_encoding = encoding ;
2564         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2565
2566         status = TECCreateConverter(&m_MB2WC_converter,
2567                                     m_char_encoding,
2568                                     m_unicode_encoding);
2569         status = TECCreateConverter(&m_WC2MB_converter,
2570                                     m_unicode_encoding,
2571                                     m_char_encoding);
2572     }
2573
2574     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2575     {
2576         OSStatus status = noErr ;
2577         ByteCount byteOutLen ;
2578         ByteCount byteInLen = strlen(psz) ;
2579         wchar_t *tbuf = NULL ;
2580         UniChar* ubuf = NULL ;
2581         size_t res = 0 ;
2582
2583         if (buf == NULL)
2584         {
2585             //apple specs say at least 32
2586             n = wxMax( 32 , byteInLen ) ;
2587             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2588         }
2589         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2590 #if SIZEOF_WCHAR_T == 4
2591         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2592 #else
2593         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2594 #endif
2595         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2596           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2597 #if SIZEOF_WCHAR_T == 4
2598         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2599         // is not properly terminated we get random characters at the end
2600         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2601         wxMBConvUTF16 converter ;
2602         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2603         free( ubuf ) ;
2604 #else
2605         res = byteOutLen / sizeof( UniChar ) ;
2606 #endif
2607         if ( buf == NULL )
2608              free(tbuf) ;
2609
2610         if ( buf  && res < n)
2611             buf[res] = 0;
2612
2613         return res ;
2614     }
2615
2616     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2617     {
2618         OSStatus status = noErr ;
2619         ByteCount byteOutLen ;
2620         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2621
2622         char *tbuf = NULL ;
2623
2624         if (buf == NULL)
2625         {
2626             //apple specs say at least 32
2627             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2628             tbuf = (char*) malloc( n ) ;
2629         }
2630
2631         ByteCount byteBufferLen = n ;
2632         UniChar* ubuf = NULL ;
2633 #if SIZEOF_WCHAR_T == 4
2634         wxMBConvUTF16 converter ;
2635         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2636         byteInLen = unicharlen ;
2637         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2638         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2639 #else
2640         ubuf = (UniChar*) psz ;
2641 #endif
2642         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2643             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2644 #if SIZEOF_WCHAR_T == 4
2645         free( ubuf ) ;
2646 #endif
2647         if ( buf == NULL )
2648             free(tbuf) ;
2649
2650         size_t res = byteOutLen ;
2651         if ( buf  && res < n)
2652         {
2653             buf[res] = 0;
2654
2655             //we need to double-trip to verify it didn't insert any ? in place
2656             //of bogus characters
2657             wxWCharBuffer wcBuf(n);
2658             size_t pszlen = wxWcslen(psz);
2659             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2660                         wxWcslen(wcBuf) != pszlen ||
2661                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2662             {
2663                 // we didn't obtain the same thing we started from, hence
2664                 // the conversion was lossy and we consider that it failed
2665                 return (size_t)-1;
2666             }
2667         }
2668
2669         return res ;
2670     }
2671
2672     virtual wxMBConv *Clone() const { return wxMBConv_mac(*this); }
2673
2674     bool IsOk() const
2675     bool IsOk() const
2676         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2677
2678 private:
2679     TECObjectRef m_MB2WC_converter ;
2680     TECObjectRef m_WC2MB_converter ;
2681
2682     TextEncodingBase m_char_encoding ;
2683     TextEncodingBase m_unicode_encoding ;
2684 };
2685
2686 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2687
2688 // ============================================================================
2689 // wxEncodingConverter based conversion classes
2690 // ============================================================================
2691
2692 #if wxUSE_FONTMAP
2693
2694 class wxMBConv_wxwin : public wxMBConv
2695 {
2696 private:
2697     void Init()
2698     {
2699         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2700                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2701     }
2702
2703 public:
2704     // temporarily just use wxEncodingConverter stuff,
2705     // so that it works while a better implementation is built
2706     wxMBConv_wxwin(const wxChar* name)
2707     {
2708         if (name)
2709             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2710         else
2711             m_enc = wxFONTENCODING_SYSTEM;
2712
2713         Init();
2714     }
2715
2716     wxMBConv_wxwin(wxFontEncoding enc)
2717     {
2718         m_enc = enc;
2719
2720         Init();
2721     }
2722
2723     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2724     {
2725         size_t inbuf = strlen(psz);
2726         if (buf)
2727         {
2728             if (!m2w.Convert(psz,buf))
2729                 return (size_t)-1;
2730         }
2731         return inbuf;
2732     }
2733
2734     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2735     {
2736         const size_t inbuf = wxWcslen(psz);
2737         if (buf)
2738         {
2739             if (!w2m.Convert(psz,buf))
2740                 return (size_t)-1;
2741         }
2742
2743         return inbuf;
2744     }
2745
2746     virtual size_t GetMBNulLen() const
2747     {
2748         switch ( m_enc )
2749         {
2750             case wxFONTENCODING_UTF16BE:
2751             case wxFONTENCODING_UTF16LE:
2752                 return 2;
2753
2754             case wxFONTENCODING_UTF32BE:
2755             case wxFONTENCODING_UTF32LE:
2756                 return 4;
2757
2758             default:
2759                 return 1;
2760         }
2761     }
2762
2763     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2764
2765     bool IsOk() const { return m_ok; }
2766
2767 public:
2768     wxFontEncoding m_enc;
2769     wxEncodingConverter m2w, w2m;
2770
2771 private:
2772     // were we initialized successfully?
2773     bool m_ok;
2774
2775     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2776 };
2777
2778 // make the constructors available for unit testing
2779 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2780 {
2781     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2782     if ( !result->IsOk() )
2783     {
2784         delete result;
2785         return 0;
2786     }
2787     return result;
2788 }
2789
2790 #endif // wxUSE_FONTMAP
2791
2792 // ============================================================================
2793 // wxCSConv implementation
2794 // ============================================================================
2795
2796 void wxCSConv::Init()
2797 {
2798     m_name = NULL;
2799     m_convReal =  NULL;
2800     m_deferred = true;
2801 }
2802
2803 wxCSConv::wxCSConv(const wxChar *charset)
2804 {
2805     Init();
2806
2807     if ( charset )
2808     {
2809         SetName(charset);
2810     }
2811
2812 #if wxUSE_FONTMAP
2813     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2814 #else
2815     m_encoding = wxFONTENCODING_SYSTEM;
2816 #endif
2817 }
2818
2819 wxCSConv::wxCSConv(wxFontEncoding encoding)
2820 {
2821     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2822     {
2823         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2824
2825         encoding = wxFONTENCODING_SYSTEM;
2826     }
2827
2828     Init();
2829
2830     m_encoding = encoding;
2831 }
2832
2833 wxCSConv::~wxCSConv()
2834 {
2835     Clear();
2836 }
2837
2838 wxCSConv::wxCSConv(const wxCSConv& conv)
2839         : wxMBConv()
2840 {
2841     Init();
2842
2843     SetName(conv.m_name);
2844     m_encoding = conv.m_encoding;
2845 }
2846
2847 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2848 {
2849     Clear();
2850
2851     SetName(conv.m_name);
2852     m_encoding = conv.m_encoding;
2853
2854     return *this;
2855 }
2856
2857 void wxCSConv::Clear()
2858 {
2859     free(m_name);
2860     delete m_convReal;
2861
2862     m_name = NULL;
2863     m_convReal = NULL;
2864 }
2865
2866 void wxCSConv::SetName(const wxChar *charset)
2867 {
2868     if (charset)
2869     {
2870         m_name = wxStrdup(charset);
2871         m_deferred = true;
2872     }
2873 }
2874
2875 #if wxUSE_FONTMAP
2876 #include "wx/hashmap.h"
2877
2878 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2879                      wxEncodingNameCache );
2880
2881 static wxEncodingNameCache gs_nameCache;
2882 #endif
2883
2884 wxMBConv *wxCSConv::DoCreate() const
2885 {
2886 #if wxUSE_FONTMAP
2887     wxLogTrace(TRACE_STRCONV,
2888                wxT("creating conversion for %s"),
2889                (m_name ? m_name
2890                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2891 #endif // wxUSE_FONTMAP
2892
2893     // check for the special case of ASCII or ISO8859-1 charset: as we have
2894     // special knowledge of it anyhow, we don't need to create a special
2895     // conversion object
2896     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2897             m_encoding == wxFONTENCODING_DEFAULT )
2898     {
2899         // don't convert at all
2900         return NULL;
2901     }
2902
2903     // we trust OS to do conversion better than we can so try external
2904     // conversion methods first
2905     //
2906     // the full order is:
2907     //      1. OS conversion (iconv() under Unix or Win32 API)
2908     //      2. hard coded conversions for UTF
2909     //      3. wxEncodingConverter as fall back
2910
2911     // step (1)
2912 #ifdef HAVE_ICONV
2913 #if !wxUSE_FONTMAP
2914     if ( m_name )
2915 #endif // !wxUSE_FONTMAP
2916     {
2917         wxString name(m_name);
2918         wxFontEncoding encoding(m_encoding);
2919
2920         if ( !name.empty() )
2921         {
2922             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2923             if ( conv->IsOk() )
2924                 return conv;
2925
2926             delete conv;
2927
2928 #if wxUSE_FONTMAP
2929             encoding =
2930                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2931 #endif // wxUSE_FONTMAP
2932         }
2933 #if wxUSE_FONTMAP
2934         {
2935             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2936             if ( it != gs_nameCache.end() )
2937             {
2938                 if ( it->second.empty() )
2939                     return NULL;
2940
2941                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2942                 if ( conv->IsOk() )
2943                     return conv;
2944
2945                 delete conv;
2946             }
2947
2948             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2949
2950             for ( ; *names; ++names )
2951             {
2952                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2953                 if ( conv->IsOk() )
2954                 {
2955                     gs_nameCache[encoding] = *names;
2956                     return conv;
2957                 }
2958
2959                 delete conv;
2960             }
2961
2962             gs_nameCache[encoding] = _T(""); // cache the failure
2963         }
2964 #endif // wxUSE_FONTMAP
2965     }
2966 #endif // HAVE_ICONV
2967
2968 #ifdef wxHAVE_WIN32_MB2WC
2969     {
2970 #if wxUSE_FONTMAP
2971         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2972                                       : new wxMBConv_win32(m_encoding);
2973         if ( conv->IsOk() )
2974             return conv;
2975
2976         delete conv;
2977 #else
2978         return NULL;
2979 #endif
2980     }
2981 #endif // wxHAVE_WIN32_MB2WC
2982 #if defined(__WXMAC__)
2983     {
2984         // leave UTF16 and UTF32 to the built-ins of wx
2985         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2986             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2987         {
2988
2989 #if wxUSE_FONTMAP
2990             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2991                                         : new wxMBConv_mac(m_encoding);
2992 #else
2993             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2994 #endif
2995             if ( conv->IsOk() )
2996                  return conv;
2997
2998             delete conv;
2999         }
3000     }
3001 #endif
3002 #if defined(__WXCOCOA__)
3003     {
3004         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3005         {
3006
3007 #if wxUSE_FONTMAP
3008             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3009                                           : new wxMBConv_cocoa(m_encoding);
3010 #else
3011             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3012 #endif
3013             if ( conv->IsOk() )
3014                  return conv;
3015
3016             delete conv;
3017         }
3018     }
3019 #endif
3020     // step (2)
3021     wxFontEncoding enc = m_encoding;
3022 #if wxUSE_FONTMAP
3023     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3024     {
3025         // use "false" to suppress interactive dialogs -- we can be called from
3026         // anywhere and popping up a dialog from here is the last thing we want to
3027         // do
3028         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3029     }
3030 #endif // wxUSE_FONTMAP
3031
3032     switch ( enc )
3033     {
3034         case wxFONTENCODING_UTF7:
3035              return new wxMBConvUTF7;
3036
3037         case wxFONTENCODING_UTF8:
3038              return new wxMBConvUTF8;
3039
3040         case wxFONTENCODING_UTF16BE:
3041              return new wxMBConvUTF16BE;
3042
3043         case wxFONTENCODING_UTF16LE:
3044              return new wxMBConvUTF16LE;
3045
3046         case wxFONTENCODING_UTF32BE:
3047              return new wxMBConvUTF32BE;
3048
3049         case wxFONTENCODING_UTF32LE:
3050              return new wxMBConvUTF32LE;
3051
3052         default:
3053              // nothing to do but put here to suppress gcc warnings
3054              ;
3055     }
3056
3057     // step (3)
3058 #if wxUSE_FONTMAP
3059     {
3060         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3061                                       : new wxMBConv_wxwin(m_encoding);
3062         if ( conv->IsOk() )
3063             return conv;
3064
3065         delete conv;
3066     }
3067 #endif // wxUSE_FONTMAP
3068
3069     // NB: This is a hack to prevent deadlock. What could otherwise happen
3070     //     in Unicode build: wxConvLocal creation ends up being here
3071     //     because of some failure and logs the error. But wxLog will try to
3072     //     attach timestamp, for which it will need wxConvLocal (to convert
3073     //     time to char* and then wchar_t*), but that fails, tries to log
3074     //     error, but wxLog has a (already locked) critical section that
3075     //     guards static buffer.
3076     static bool alreadyLoggingError = false;
3077     if (!alreadyLoggingError)
3078     {
3079         alreadyLoggingError = true;
3080         wxLogError(_("Cannot convert from the charset '%s'!"),
3081                    m_name ? m_name
3082                       :
3083 #if wxUSE_FONTMAP
3084                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3085 #else // !wxUSE_FONTMAP
3086                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3087 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3088               );
3089         alreadyLoggingError = false;
3090     }
3091
3092     return NULL;
3093 }
3094
3095 void wxCSConv::CreateConvIfNeeded() const
3096 {
3097     if ( m_deferred )
3098     {
3099         wxCSConv *self = (wxCSConv *)this; // const_cast
3100
3101 #if wxUSE_INTL
3102         // if we don't have neither the name nor the encoding, use the default
3103         // encoding for this system
3104         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3105         {
3106             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3107         }
3108 #endif // wxUSE_INTL
3109
3110         self->m_convReal = DoCreate();
3111         self->m_deferred = false;
3112     }
3113 }
3114
3115 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3116 {
3117     CreateConvIfNeeded();
3118
3119     if (m_convReal)
3120         return m_convReal->MB2WC(buf, psz, n);
3121
3122     // latin-1 (direct)
3123     size_t len = strlen(psz);
3124
3125     if (buf)
3126     {
3127         for (size_t c = 0; c <= len; c++)
3128             buf[c] = (unsigned char)(psz[c]);
3129     }
3130
3131     return len;
3132 }
3133
3134 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3135 {
3136     CreateConvIfNeeded();
3137
3138     if (m_convReal)
3139         return m_convReal->WC2MB(buf, psz, n);
3140
3141     // latin-1 (direct)
3142     const size_t len = wxWcslen(psz);
3143     if (buf)
3144     {
3145         for (size_t c = 0; c <= len; c++)
3146         {
3147             if (psz[c] > 0xFF)
3148                 return (size_t)-1;
3149             buf[c] = (char)psz[c];
3150         }
3151     }
3152     else
3153     {
3154         for (size_t c = 0; c <= len; c++)
3155         {
3156             if (psz[c] > 0xFF)
3157                 return (size_t)-1;
3158         }
3159     }
3160
3161     return len;
3162 }
3163
3164 size_t wxCSConv::GetMBNulLen() const
3165 {
3166     CreateConvIfNeeded();
3167
3168     if ( m_convReal )
3169     {
3170         return m_convReal->GetMBNulLen();
3171     }
3172
3173     return 1;
3174 }
3175
3176 // ----------------------------------------------------------------------------
3177 // globals
3178 // ----------------------------------------------------------------------------
3179
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the other converter objects; they are file-local and only accessible via
// the exported references defined below
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;

// exported references to the objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;

// exported pointers: wxConvCurrent initially points to the libc converter
// and so does wxConvFileName, except when __WXOSX__ is defined, where it
// points to the UTF-8 one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
#ifdef __WXOSX__
                                    wxConvUTF8Obj;
#else
                                    wxConvLibcObj;
#endif
3205
3206
3207 #else // !wxUSE_WCHAR_T
3208
// stand-ins in absence of wchar_t: plain wxMBConv objects are defined under
// the names which are references to the real converter objects when
// wxUSE_WCHAR_T is enabled
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3214
3215 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T