src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded; but
 207             // still account for the trailing NULL
 208             dstWritten++;
 209             break;
 210         }
 211
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for trailing NUL
 216
 217         dstWritten += lenChunk;
 218
 219         if ( dst )
 220         {
 221             if ( dstWritten > dstLen )
 222                 return wxCONV_FAILED;
 223
 224             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 225                 return wxCONV_FAILED;
 226
 227             dst += lenChunk;
 228         }
 229
 230         if ( !srcEnd )
 231         {
 232             // we convert the entire string in this cas, as we suppose that the
 233             // string is NUL-terminated and so srcEnd is not used at all
 234             break;
 235         }
 236
 237         // advance the input pointer past the end of this chunk
 238         while ( NotAllNULs(src, nulLen) )
 239         {
 240             // notice that we must skip over multiple bytes here as we suppose
 241             // that if NUL takes 2 or 4 bytes, then all the other characters do
 242             // too and so if advanced by a single byte we might erroneously
 243             // detect sequences of NUL bytes in the middle of the input
 244             src += nulLen;
 245         }
 246
 247         src += nulLen; // skipping over its terminator as well
 248
 249         // note that ">=" (and not just "==") is needed here as the terminator
 250         // we skipped just above could be inside or just after the buffer
 251         // delimited by inEnd
 252         if ( src >= srcEnd )
 253             break;
 254     }
 255
 256     return dstWritten;
 257 }
 258
 259 size_t
 260 wxMBConv::FromWChar(char *dst, size_t dstLen,
 261                     const wchar_t *src, size_t srcLen) const
 262 {
 263     // the number of chars [which would be] written to dst [if it were not NULL]
 264     size_t dstWritten = 0;
 265
 266     // make a copy of the input string unless it is already properly
 267     // NUL-terminated
 268     //
 269     // if we don't know its length we have no choice but to assume that it is,
 270     // indeed, properly terminated
 271     wxWCharBuffer bufTmp;
 272     if ( srcLen == (size_t)-1 )
 273     {
 274         srcLen = wxWcslen(src) + 1;
 275     }
 276     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 277     {
 278         // make a copy in order to properly NUL-terminate the string
 279         bufTmp = wxWCharBuffer(srcLen);
 280         memcpy(bufTmp.data(), src, srcLen*sizeof(wchar_t));
 281         src = bufTmp;
 282     }
 283
 284     const size_t lenNul = GetMBNulLen();
 285     for ( const wchar_t * const srcEnd = src + srcLen;
 286           src < srcEnd;
 287           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 288     {
 289         // try to convert the current chunk
 290         size_t lenChunk = WC2MB(NULL, src, 0);
 291
 292         if ( lenChunk == wxCONV_FAILED )
 293             return wxCONV_FAILED;
 294
 295         lenChunk += lenNul;
 296         dstWritten += lenChunk;
 297
 298         if ( dst )
 299         {
 300             if ( dstWritten > dstLen )
 301                 return wxCONV_FAILED;
 302
 303             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 304                 return wxCONV_FAILED;
 305
 306             dst += lenChunk;
 307         }
 308     }
 309
 310     return dstWritten;
 311 }
 312
 313 size_t wxMBConv::MB2WC(wchar_t *out, const char *in, size_t outLen) const
 314 {
 315     size_t rc = ToWChar(out, outLen, in);
 316     if ( rc != wxCONV_FAILED )
 317     {
 318         // ToWChar() returns the buffer length, i.e. including the trailing
 319         // NUL, while this method doesn't take it into account
 320         rc--;
 321     }
 322
 323     return rc;
 324 }
 325
 326 size_t wxMBConv::WC2MB(char *out, const wchar_t *in, size_t outLen) const
 327 {
 328     size_t rc = FromWChar(out, outLen, in);
 329     if ( rc != wxCONV_FAILED )
 330     {
 331         rc -= GetMBNulLen();
 332     }
 333
 334     return rc;
 335 }
 336
 337 wxMBConv::~wxMBConv()
 338 {
 339     // nothing to do here (necessary for Darwin linking probably)
 340 }
 341
 342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 343 {
 344     if ( psz )
 345     {
 346         // calculate the length of the buffer needed first
 347         const size_t nLen = MB2WC(NULL, psz, 0);
 348         if ( nLen != wxCONV_FAILED )
 349         {
 350             // now do the actual conversion
 351             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 352
 353             // +1 for the trailing NULL
 354             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 355                 return buf;
 356         }
 357     }
 358
 359     return wxWCharBuffer();
 360 }
 361
 362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 363 {
 364     if ( pwz )
 365     {
 366         const size_t nLen = WC2MB(NULL, pwz, 0);
 367         if ( nLen != wxCONV_FAILED )
 368         {
 369             // extra space for trailing NUL(s)
 370             static const size_t extraLen = GetMaxMBNulLen();
 371
 372             wxCharBuffer buf(nLen + extraLen - 1);
 373             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxCharBuffer();
 379 }
 380
 381 const wxWCharBuffer
 382 wxMBConv::cMB2WC(const char *in, size_t inLen, size_t *outLen) const
 383 {
 384     const size_t dstLen = ToWChar(NULL, 0, in, inLen);
 385     if ( dstLen != wxCONV_FAILED )
 386     {
 387         wxWCharBuffer wbuf(dstLen - 1);
 388         if ( ToWChar(wbuf.data(), dstLen, in, inLen) )
 389         {
 390             if ( outLen )
 391                 *outLen = dstLen - 1;
 392             return wbuf;
 393         }
 394     }
 395
 396     if ( outLen )
 397         *outLen = 0;
 398
 399     return wxWCharBuffer();
 400 }
 401
 402 const wxCharBuffer
 403 wxMBConv::cWC2MB(const wchar_t *in, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = FromWChar(NULL, 0, in, inLen);
 406     if ( dstLen != wxCONV_FAILED )
 407     {
 408         wxCharBuffer buf(dstLen - 1);
 409         if ( FromWChar(buf.data(), dstLen, in, inLen) )
 410         {
 411             if ( outLen )
 412                 *outLen = dstLen - 1;
 413             return buf;
 414         }
 415     }
 416
 417     if ( outLen )
 418         *outLen = 0;
 419
 420     return wxCharBuffer();
 421 }
 422
 423 // ----------------------------------------------------------------------------
 424 // wxMBConvLibc
 425 // ----------------------------------------------------------------------------
 426
 427 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 428 {
 429     return wxMB2WC(buf, psz, n);
 430 }
 431
 432 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 433 {
 434     return wxWC2MB(buf, psz, n);
 435 }
 436
 437 // ----------------------------------------------------------------------------
 438 // wxConvBrokenFileNames
 439 // ----------------------------------------------------------------------------
 440
 441 #ifdef __UNIX__
 442
 443 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 444 {
 445     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 446                   || wxStricmp(charset, _T("UTF8")) == 0  )
 447         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 448     else
 449         m_conv = new wxCSConv(charset);
 450 }
 451
 452 #endif // __UNIX__
 453
 454 // ----------------------------------------------------------------------------
 455 // UTF-7
 456 // ----------------------------------------------------------------------------
 457
 458 // Implementation (C) 2004 Fredrik Roubert
 459
 460 //
 461 // BASE64 decoding table
 462 //
 463 static const unsigned char utf7unb64[] =
 464 {
 465     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 471     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 472     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 473     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 474     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 475     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 476     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 477     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 478     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 479     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 480     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 481     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 482     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 483     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 485     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 486     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 488     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 489     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 497 };
 498
 499 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 500 {
 501     size_t len = 0;
 502
 503     while ( *psz && (!buf || (len < n)) )
 504     {
 505         unsigned char cc = *psz++;
 506         if (cc != '+')
 507         {
 508             // plain ASCII char
 509             if (buf)
 510                 *buf++ = cc;
 511             len++;
 512         }
 513         else if (*psz == '-')
 514         {
 515             // encoded plus sign
 516             if (buf)
 517                 *buf++ = cc;
 518             len++;
 519             psz++;
 520         }
 521         else // start of BASE64 encoded string
 522         {
 523             bool lsb, ok;
 524             unsigned int d, l;
 525             for ( ok = lsb = false, d = 0, l = 0;
 526                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 527                   psz++ )
 528             {
 529                 d <<= 6;
 530                 d += cc;
 531                 for (l += 6; l >= 8; lsb = !lsb)
 532                 {
 533                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 534                     if (lsb)
 535                     {
 536                         if (buf)
 537                             *buf++ |= c;
 538                         len ++;
 539                     }
 540                     else
 541                     {
 542                         if (buf)
 543                             *buf = (wchar_t)(c << 8);
 544                     }
 545
 546                     ok = true;
 547                 }
 548             }
 549
 550             if ( !ok )
 551             {
 552                 // in valid UTF7 we should have valid characters after '+'
 553                 return (size_t)-1;
 554             }
 555
 556             if (*psz == '-')
 557                 psz++;
 558         }
 559     }
 560
 561     if ( buf && (len < n) )
 562         *buf = '\0';
 563
 564     return len;
 565 }
 566
 567 //
 568 // BASE64 encoding table
 569 //
 570 static const unsigned char utf7enb64[] =
 571 {
 572     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 573     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 574     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 575     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 576     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 577     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 578     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 579     '4', '5', '6', '7', '8', '9', '+', '/'
 580 };
 581
 582 //
 583 // UTF-7 encoding table
 584 //
 585 // 0 - Set D (directly encoded characters)
 586 // 1 - Set O (optional direct characters)
 587 // 2 - whitespace characters (optional)
 588 // 3 - special characters
 589 //
 590 static const unsigned char utf7encode[128] =
 591 {
 592     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 593     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 594     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 595     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 596     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 597     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 598     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 599     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 600 };
 601
 602 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 603 {
 604     size_t len = 0;
 605
 606     while (*psz && ((!buf) || (len < n)))
 607     {
 608         wchar_t cc = *psz++;
 609         if (cc < 0x80 && utf7encode[cc] < 1)
 610         {
 611             // plain ASCII char
 612             if (buf)
 613                 *buf++ = (char)cc;
 614             len++;
 615         }
 616 #ifndef WC_UTF16
 617         else if (((wxUint32)cc) > 0xffff)
 618         {
 619             // no surrogate pair generation (yet?)
 620             return (size_t)-1;
 621         }
 622 #endif
 623         else
 624         {
 625             if (buf)
 626                 *buf++ = '+';
 627             len++;
 628             if (cc != '+')
 629             {
 630                 // BASE64 encode string
 631                 unsigned int lsb, d, l;
 632                 for (d = 0, l = 0; /*nothing*/; psz++)
 633                 {
 634                     for (lsb = 0; lsb < 2; lsb ++)
 635                     {
 636                         d <<= 8;
 637                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 638
 639                         for (l += 8; l >= 6; )
 640                         {
 641                             l -= 6;
 642                             if (buf)
 643                                 *buf++ = utf7enb64[(d >> l) % 64];
 644                             len++;
 645                         }
 646                     }
 647                     cc = *psz;
 648                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 649                         break;
 650                 }
 651                 if (l != 0)
 652                 {
 653                     if (buf)
 654                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 655                     len++;
 656                 }
 657             }
 658             if (buf)
 659                 *buf++ = '-';
 660             len++;
 661         }
 662     }
 663     if (buf && (len < n))
 664         *buf = 0;
 665     return len;
 666 }
 667
 668 // ----------------------------------------------------------------------------
 669 // UTF-8
 670 // ----------------------------------------------------------------------------
 671
 672 static wxUint32 utf8_max[]=
 673     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 674
 675 // boundaries of the private use area we use to (temporarily) remap invalid
 676 // characters invalid in a UTF-8 encoded string
 677 const wxUint32 wxUnicodePUA = 0x100000;
 678 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 679
 680 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 681 {
         // Convert the NUL-terminated UTF-8 string psz to wide characters.
         //
         // If buf is NULL, nothing is stored and only the length of the result
         // is computed, otherwise conversion stops once n wide characters have
         // been counted and the result is NUL-terminated only if there is
         // still room for the NUL.
         //
         // Invalid input is handled according to m_options: its bytes are
         // either mapped to the range starting at wxUnicodePUA, or output as
         // backslash followed by 3 octal digits or, if neither option is set,
         // make the function return (size_t)-1.
         //
         // Returns the number of wide characters produced, not counting the
         // trailing NUL.
 682     size_t len = 0;
 683
 684     while (*psz && ((!buf) || (len < n)))
 685     {
             // remember the start of the sequence to be able to output its
             // bytes one by one if it turns out to be invalid
 686         const char *opsz = psz;
 687         bool invalid = false;
 688         unsigned char cc = *psz++, fc = cc;
             // count the leading 1 bits of the first byte
 689         unsigned cnt;
 690         for (cnt = 0; fc & 0x80; cnt++)
 691             fc <<= 1;
 692         if (!cnt)
 693         {
 694             // plain ASCII char
 695             if (buf)
 696                 *buf++ = cc;
 697             len++;
 698
 699             // escape the escape character for octal escapes
 700             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
 701                     && cc == '\\' && (!buf || len < n))
 702             {
 703                 if (buf)
 704                     *buf++ = cc;
 705                 len++;
 706             }
 707         }
 708         else
 709         {
                 // cnt is now the number of bytes expected to follow the first
                 // one; 0 means that the first byte had a single leading 1 bit
 710             cnt--;
 711             if (!cnt)
 712             {
 713                 // invalid UTF-8 sequence
 714                 invalid = true;
 715             }
 716             else
 717             {
                     // ocnt is the index in utf8_max of the greatest value
                     // encodable with one byte less than this sequence uses
 718                 unsigned ocnt = cnt - 1;
                     // start with the payload bits of the first byte
 719                 wxUint32 res = cc & (0x3f >> cnt);
 720                 while (cnt--)
 721                 {
                         // each following byte must be of the form 10xxxxxx;
                         // notice that an offending byte is not consumed
 722                     cc = *psz;
 723                     if ((cc & 0xC0) != 0x80)
 724                     {
 725                         // invalid UTF-8 sequence
 726                         invalid = true;
 727                         break;
 728                     }
 729                     psz++;
 730                     res = (res << 6) | (cc & 0x3f);
 731                 }
                     // a value which would fit in a shorter sequence is
                     // rejected as well
 732                 if (invalid || res <= utf8_max[ocnt])
 733                 {
 734                     // illegal UTF-8 encoding
 735                     invalid = true;
 736                 }
 737                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
 738                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
 739                 {
 740                     // if one of our PUA characters turns up externally
 741                     // it must also be treated as an illegal sequence
 742                     // (a bit like you have to escape an escape character)
 743                     invalid = true;
 744                 }
 745                 else
 746                 {
 747 #ifdef WC_UTF16
 748                     // cast is ok because wchar_t == wxUuint16 if WC_UTF16
                         // NOTE(review): this may store 2 units without
                         // checking that len + 1 < n -- confirm callers always
                         // size the buffer from a previous NULL-buf call
 749                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
 750                     if (pa == (size_t)-1)
 751                     {
 752                         invalid = true;
 753                     }
 754                     else
 755                     {
 756                         if (buf)
 757                             buf += pa;
 758                         len += pa;
 759                     }
 760 #else // !WC_UTF16
 761                     if (buf)
 762                         *buf++ = (wchar_t)res;
 763                     len++;
 764 #endif // WC_UTF16/!WC_UTF16
 765                 }
 766             }
 767             if (invalid)
 768             {
 769                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
 770                 {
                         // output each byte consumed so far as the character
                         // wxUnicodePUA + byte value
 771                     while (opsz < psz && (!buf || len < n))
 772                     {
 773 #ifdef WC_UTF16
 774                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 775                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
 776                         wxASSERT(pa != (size_t)-1);
 777                         if (buf)
 778                             buf += pa;
 779                         opsz++;
 780                         len += pa;
 781 #else
 782                         if (buf)
 783                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
 784                         opsz++;
 785                         len++;
 786 #endif
 787                     }
 788                 }
 789                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 790                 {
                         // output each byte consumed so far as a backslash
                         // followed by its value as 3 octal digits
 791                     while (opsz < psz && (!buf || len < n))
 792                     {
                             // the 4 characters are only stored if they all
                             // fit but are counted in len in any case
 793                         if ( buf && len + 3 < n )
 794                         {
 795                             unsigned char on = *opsz;
 796                             *buf++ = L'\\';
 797                             *buf++ = (wchar_t)( L'0' + on / 0100 );
 798                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
 799                             *buf++ = (wchar_t)( L'0' + on % 010 );
 800                         }
 801                         opsz++;
 802                         len += 4;
 803                     }
 804                 }
 805                 else // MAP_INVALID_UTF8_NOT
 806                 {
 807                     return (size_t)-1;
 808                 }
 809             }
 810         }
 811     }
         // NUL-terminate the result if there is enough space for it
 812     if (buf && (len < n))
 813         *buf = 0;
 814     return len;
 815 }
 816
 817 static inline bool isoctal(wchar_t wch)
 818 {
 819     return L'0' <= wch && wch <= L'7';
 820 }
 821
 822 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 823 {
 824     size_t len = 0;
 825
 826     while (*psz && ((!buf) || (len < n)))
 827     {
 828         wxUint32 cc;
 829 #ifdef WC_UTF16
 830         // cast is ok for WC_UTF16
 831         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 832         psz += (pa == (size_t)-1) ? 1 : pa;
 833 #else
 834         cc=(*psz++) & 0x7fffffff;
 835 #endif
 836
 837         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 838                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 839         {
 840             if (buf)
 841                 *buf++ = (char)(cc - wxUnicodePUA);
 842             len++;
 843         }
 844         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 845                     && cc == L'\\' && psz[0] == L'\\' )
 846         {
 847             if (buf)
 848                 *buf++ = (char)cc;
 849             psz++;
 850             len++;
 851         }
 852         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 853                     cc == L'\\' &&
 854                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 855         {
 856             if (buf)
 857             {
 858                 *buf++ = (char) ((psz[0] - L'0')*0100 +
 859                                  (psz[1] - L'0')*010 +
 860                                  (psz[2] - L'0'));
 861             }
 862
 863             psz += 3;
 864             len++;
 865         }
 866         else
 867         {
 868             unsigned cnt;
 869             for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 870             if (!cnt)
 871             {
 872                 // plain ASCII char
 873                 if (buf)
 874                     *buf++ = (char) cc;
 875                 len++;
 876             }
 877
 878             else
 879             {
 880                 len += cnt + 1;
 881                 if (buf)
 882                 {
 883                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 884                     while (cnt--)
 885                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 886                 }
 887             }
 888         }
 889     }
 890
 891     if (buf && (len<n))
 892         *buf = 0;
 893
 894     return len;
 895 }
 896
 897 // ----------------------------------------------------------------------------
 898 // UTF-16
 899 // ----------------------------------------------------------------------------
 900
 901 #ifdef WORDS_BIGENDIAN
 902     #define wxMBConvUTF16straight wxMBConvUTF16BE
 903     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 904 #else
 905     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 906     #define wxMBConvUTF16straight wxMBConvUTF16LE
 907 #endif
 908
 909
 910 #ifdef WC_UTF16
 911
 912 // copy 16bit MB to 16bit String
 913 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 914 {
 915     size_t len=0;
 916
 917     while (*(wxUint16*)psz && (!buf || len < n))
 918     {
 919         if (buf)
 920             *buf++ = *(wxUint16*)psz;
 921         len++;
 922
 923         psz += sizeof(wxUint16);
 924     }
 925     if (buf && len<n)   *buf=0;
 926
 927     return len;
 928 }
 929
 930
 931 // copy 16bit String to 16bit MB
 932 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 933 {
 934     size_t len=0;
 935
 936     while (*psz && (!buf || len < n))
 937     {
 938         if (buf)
 939         {
 940             *(wxUint16*)buf = *psz;
 941             buf += sizeof(wxUint16);
 942         }
 943         len += sizeof(wxUint16);
 944         psz++;
 945     }
 946     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 947
 948     return len;
 949 }
 950
 951
 952 // swap 16bit MB to 16bit String
 953 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 954 {
 955     size_t len = 0;
 956
 957     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 958     // inside the string
 959     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 960     {
 961         if ( buf )
 962         {
 963             ((char *)buf)[0] = psz[1];
 964             ((char *)buf)[1] = psz[0];
 965             buf++;
 966         }
 967         len++;
 968         psz += 2;
 969     }
 970
 971     if ( buf && len < n )
 972         *buf = L'\0';
 973
 974     return len;
 975 }
 976
 977
 978 // swap 16bit MB to 16bit String
 979 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 980 {
 981     size_t len = 0;
 982
 983     while ( *psz && (!buf || len < n) )
 984     {
 985         if ( buf )
 986         {
 987             *buf++ = ((char*)psz)[1];
 988             *buf++ = ((char*)psz)[0];
 989         }
 990         len += 2;
 991         psz++;
 992     }
 993
 994     if ( buf && len < n - 1 )
 995     {
 996         buf[0] =
 997         buf[1] = '\0';
 998     }
 999
1000     return len;
1001 }
1002
1003
1004 #else // WC_UTF16
1005
1006
1007 // copy 16bit MB to 32bit String
1008 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1009 {
1010     size_t len=0;
1011
1012     while (*(wxUint16*)psz && (!buf || len < n))
1013     {
1014         wxUint32 cc;
1015         size_t pa=decode_utf16((wxUint16*)psz, cc);
1016         if (pa == (size_t)-1)
1017             return pa;
1018
1019         if (buf)
1020             *buf++ = (wchar_t)cc;
1021         len++;
1022         psz += pa * sizeof(wxUint16);
1023     }
1024     if (buf && len<n)   *buf=0;
1025
1026     return len;
1027 }
1028
1029
1030 // copy 32bit String to 16bit MB
1031 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1032 {
1033     size_t len=0;
1034
1035     while (*psz && (!buf || len < n))
1036     {
1037         wxUint16 cc[2];
1038         size_t pa=encode_utf16(*psz, cc);
1039
1040         if (pa == (size_t)-1)
1041             return pa;
1042
1043         if (buf)
1044         {
1045             *(wxUint16*)buf = cc[0];
1046             buf += sizeof(wxUint16);
1047             if (pa > 1)
1048             {
1049                 *(wxUint16*)buf = cc[1];
1050                 buf += sizeof(wxUint16);
1051             }
1052         }
1053
1054         len += pa*sizeof(wxUint16);
1055         psz++;
1056     }
1057     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1058
1059     return len;
1060 }
1061
1062
1063 // swap 16bit MB to 32bit String
1064 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1065 {
1066     size_t len=0;
1067
1068     while (*(wxUint16*)psz && (!buf || len < n))
1069     {
1070         wxUint32 cc;
1071         char tmp[4];
1072         tmp[0]=psz[1];  tmp[1]=psz[0];
1073         tmp[2]=psz[3];  tmp[3]=psz[2];
1074
1075         size_t pa=decode_utf16((wxUint16*)tmp, cc);
1076         if (pa == (size_t)-1)
1077             return pa;
1078
1079         if (buf)
1080             *buf++ = (wchar_t)cc;
1081
1082         len++;
1083         psz += pa * sizeof(wxUint16);
1084     }
1085     if (buf && len<n)   *buf=0;
1086
1087     return len;
1088 }
1089
1090
1091 // swap 32bit String to 16bit MB
1092 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1093 {
1094     size_t len=0;
1095
1096     while (*psz && (!buf || len < n))
1097     {
1098         wxUint16 cc[2];
1099         size_t pa=encode_utf16(*psz, cc);
1100
1101         if (pa == (size_t)-1)
1102             return pa;
1103
1104         if (buf)
1105         {
1106             *buf++ = ((char*)cc)[1];
1107             *buf++ = ((char*)cc)[0];
1108             if (pa > 1)
1109             {
1110                 *buf++ = ((char*)cc)[3];
1111                 *buf++ = ((char*)cc)[2];
1112             }
1113         }
1114
1115         len += pa*sizeof(wxUint16);
1116         psz++;
1117     }
1118     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
1119
1120     return len;
1121 }
1122
1123 #endif // WC_UTF16
1124
1125
1126 // ----------------------------------------------------------------------------
1127 // UTF-32
1128 // ----------------------------------------------------------------------------
1129
// Map the two implementation names used below onto the public LE/BE classes:
// "straight" is the converter whose byte order matches the host, "swap" is
// the one that has to reverse the four bytes of every 32 bit value.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global little and big endian UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1141
1142
1143 #ifdef WC_UTF16
1144
1145 // copy 32bit MB to 16bit String
1146 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1147 {
1148     size_t len=0;
1149
1150     while (*(wxUint32*)psz && (!buf || len < n))
1151     {
1152         wxUint16 cc[2];
1153
1154         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
1155         if (pa == (size_t)-1)
1156             return pa;
1157
1158         if (buf)
1159         {
1160             *buf++ = cc[0];
1161             if (pa > 1)
1162                 *buf++ = cc[1];
1163         }
1164         len += pa;
1165         psz += sizeof(wxUint32);
1166     }
1167     if (buf && len<n)   *buf=0;
1168
1169     return len;
1170 }
1171
1172
1173 // copy 16bit String to 32bit MB
1174 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1175 {
1176     size_t len=0;
1177
1178     while (*psz && (!buf || len < n))
1179     {
1180         wxUint32 cc;
1181
1182         // cast is ok for WC_UTF16
1183         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1184         if (pa == (size_t)-1)
1185             return pa;
1186
1187         if (buf)
1188         {
1189             *(wxUint32*)buf = cc;
1190             buf += sizeof(wxUint32);
1191         }
1192         len += sizeof(wxUint32);
1193         psz += pa;
1194     }
1195
1196     if (buf && len<=n-sizeof(wxUint32))
1197         *(wxUint32*)buf=0;
1198
1199     return len;
1200 }
1201
1202
1203
1204 // swap 32bit MB to 16bit String
1205 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1206 {
1207     size_t len=0;
1208
1209     while (*(wxUint32*)psz && (!buf || len < n))
1210     {
1211         char tmp[4];
1212         tmp[0] = psz[3];   tmp[1] = psz[2];
1213         tmp[2] = psz[1];   tmp[3] = psz[0];
1214
1215
1216         wxUint16 cc[2];
1217
1218         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1219         if (pa == (size_t)-1)
1220             return pa;
1221
1222         if (buf)
1223         {
1224             *buf++ = cc[0];
1225             if (pa > 1)
1226                 *buf++ = cc[1];
1227         }
1228         len += pa;
1229         psz += sizeof(wxUint32);
1230     }
1231
1232     if (buf && len<n)
1233         *buf=0;
1234
1235     return len;
1236 }
1237
1238
1239 // swap 16bit String to 32bit MB
1240 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1241 {
1242     size_t len=0;
1243
1244     while (*psz && (!buf || len < n))
1245     {
1246         char cc[4];
1247
1248         // cast is ok for WC_UTF16
1249         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1250         if (pa == (size_t)-1)
1251             return pa;
1252
1253         if (buf)
1254         {
1255             *buf++ = cc[3];
1256             *buf++ = cc[2];
1257             *buf++ = cc[1];
1258             *buf++ = cc[0];
1259         }
1260         len += sizeof(wxUint32);
1261         psz += pa;
1262     }
1263
1264     if (buf && len<=n-sizeof(wxUint32))
1265         *(wxUint32*)buf=0;
1266
1267     return len;
1268 }
1269
1270 #else // WC_UTF16
1271
1272
1273 // copy 32bit MB to 32bit String
1274 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1275 {
1276     size_t len=0;
1277
1278     while (*(wxUint32*)psz && (!buf || len < n))
1279     {
1280         if (buf)
1281             *buf++ = (wchar_t)(*(wxUint32*)psz);
1282         len++;
1283         psz += sizeof(wxUint32);
1284     }
1285
1286     if (buf && len<n)
1287         *buf=0;
1288
1289     return len;
1290 }
1291
1292
1293 // copy 32bit String to 32bit MB
1294 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1295 {
1296     size_t len=0;
1297
1298     while (*psz && (!buf || len < n))
1299     {
1300         if (buf)
1301         {
1302             *(wxUint32*)buf = *psz;
1303             buf += sizeof(wxUint32);
1304         }
1305
1306         len += sizeof(wxUint32);
1307         psz++;
1308     }
1309
1310     if (buf && len<=n-sizeof(wxUint32))
1311         *(wxUint32*)buf=0;
1312
1313     return len;
1314 }
1315
1316
1317 // swap 32bit MB to 32bit String
1318 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1319 {
1320     size_t len=0;
1321
1322     while (*(wxUint32*)psz && (!buf || len < n))
1323     {
1324         if (buf)
1325         {
1326             ((char *)buf)[0] = psz[3];
1327             ((char *)buf)[1] = psz[2];
1328             ((char *)buf)[2] = psz[1];
1329             ((char *)buf)[3] = psz[0];
1330             buf++;
1331         }
1332         len++;
1333         psz += sizeof(wxUint32);
1334     }
1335
1336     if (buf && len<n)
1337         *buf=0;
1338
1339     return len;
1340 }
1341
1342
1343 // swap 32bit String to 32bit MB
1344 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1345 {
1346     size_t len=0;
1347
1348     while (*psz && (!buf || len < n))
1349     {
1350         if (buf)
1351         {
1352             *buf++ = ((char *)psz)[3];
1353             *buf++ = ((char *)psz)[2];
1354             *buf++ = ((char *)psz)[1];
1355             *buf++ = ((char *)psz)[0];
1356         }
1357         len += sizeof(wxUint32);
1358         psz++;
1359     }
1360
1361     if (buf && len<=n-sizeof(wxUint32))
1362         *(wxUint32*)buf=0;
1363
1364     return len;
1365 }
1366
1367
1368 #endif // WC_UTF16
1369
1370
1371 // ============================================================================
1372 // The classes doing conversion using the iconv_xxx() functions
1373 // ============================================================================
1374
1375 #ifdef HAVE_ICONV
1376
1377 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1378 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1379 //     (unless there's yet another bug in glibc) the only case when iconv()
1380 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1381 //     left in the input buffer -- when _real_ error occurs,
1382 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1383 //     iconv() failure.
1384 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted; the latter is only
// looked at for the old glibc versions
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of an input pointer to the type used for the input buffer
// argument in the iconv() calls below
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value against which the results of iconv_open() are compared to detect
// that no conversion descriptor could be created
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP reverses the bytes of a single wchar_t, WC_ENC is the encoding
// whose names are tried when looking for the iconv charset matching wchar_t
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1406
1407 // ----------------------------------------------------------------------------
1408 // wxMBConv_iconv: encapsulates an iconv character set
1409 // ----------------------------------------------------------------------------
1410
// Converter between a named multibyte charset and wchar_t which delegates the
// actual work to iconv(): one descriptor is kept for each direction and, with
// wxUSE_THREADS, a mutex serializes their use.
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name, it is converted to ASCII and passed to
    // iconv_open(); use IsOk() to check whether the object is usable
    wxMBConv_iconv(const wxChar *name);
    virtual ~wxMBConv_iconv();

    // both functions return (size_t)-1 if the conversion failed; buf may be
    // NULL to only compute the length of the result
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // classify this encoding as explained in wxMBConv::GetMBNulLen()
    // comment
    virtual size_t GetMBNulLen() const;

    // true only if the descriptors for both directions could be opened
    bool IsOk() const
        { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;
#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain empty
    static wxString ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;

    // cached result of GetMBNulLen(); set to 0 meaning "unknown"
    // initially
    size_t m_minMBCharWidth;
};
1450
1451 // make the constructor available for unit testing
1452 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1453 {
1454     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1455     if ( !result->IsOk() )
1456     {
1457         delete result;
1458         return 0;
1459     }
1460     return result;
1461 }
1462
// definitions of the static members: the charset name starts empty and is
// filled in by the constructor once a working wchar_t charset has been found
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1465
1466 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1467 {
1468     m_minMBCharWidth = 0;
1469
1470     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1471     // names for the charsets
1472     const wxCharBuffer cname(wxString(name).ToAscii());
1473
1474     // check for charset that represents wchar_t:
1475     if ( ms_wcCharsetName.empty() )
1476     {
1477         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1478
1479 #if wxUSE_FONTMAP
1480         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1481 #else // !wxUSE_FONTMAP
1482         static const wxChar *names[] =
1483         {
1484 #if SIZEOF_WCHAR_T == 4
1485             _T("UCS-4"),
1486 #elif SIZEOF_WCHAR_T = 2
1487             _T("UCS-2"),
1488 #endif
1489             NULL
1490         };
1491 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1492
1493         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1494         {
1495             const wxString nameCS(*names);
1496
1497             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1498             wxString nameXE(nameCS);
1499             #ifdef WORDS_BIGENDIAN
1500                 nameXE += _T("BE");
1501             #else // little endian
1502                 nameXE += _T("LE");
1503             #endif
1504
1505             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1506                        nameXE.c_str());
1507
1508             m2w = iconv_open(nameXE.ToAscii(), cname);
1509             if ( m2w == ICONV_T_INVALID )
1510             {
1511                 // try charset w/o bytesex info (e.g. "UCS4")
1512                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1513                            nameCS.c_str());
1514                 m2w = iconv_open(nameCS.ToAscii(), cname);
1515
1516                 // and check for bytesex ourselves:
1517                 if ( m2w != ICONV_T_INVALID )
1518                 {
1519                     char    buf[2], *bufPtr;
1520                     wchar_t wbuf[2], *wbufPtr;
1521                     size_t  insz, outsz;
1522                     size_t  res;
1523
1524                     buf[0] = 'A';
1525                     buf[1] = 0;
1526                     wbuf[0] = 0;
1527                     insz = 2;
1528                     outsz = SIZEOF_WCHAR_T * 2;
1529                     wbufPtr = wbuf;
1530                     bufPtr = buf;
1531
1532                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1533                                 (char**)&wbufPtr, &outsz);
1534
1535                     if (ICONV_FAILED(res, insz))
1536                     {
1537                         wxLogLastError(wxT("iconv"));
1538                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1539                                    nameCS.c_str());
1540                     }
1541                     else // ok, can convert to this encoding, remember it
1542                     {
1543                         ms_wcCharsetName = nameCS;
1544                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1545                     }
1546                 }
1547             }
1548             else // use charset not requiring byte swapping
1549             {
1550                 ms_wcCharsetName = nameXE;
1551             }
1552         }
1553
1554         wxLogTrace(TRACE_STRCONV,
1555                    wxT("iconv wchar_t charset is \"%s\"%s"),
1556                    ms_wcCharsetName.empty() ? _T("<none>")
1557                                             : ms_wcCharsetName.c_str(),
1558                    ms_wcNeedsSwap ? _T(" (needs swap)")
1559                                   : _T(""));
1560     }
1561     else // we already have ms_wcCharsetName
1562     {
1563         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1564     }
1565
1566     if ( ms_wcCharsetName.empty() )
1567     {
1568         w2m = ICONV_T_INVALID;
1569     }
1570     else
1571     {
1572         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1573         if ( w2m == ICONV_T_INVALID )
1574         {
1575             wxLogTrace(TRACE_STRCONV,
1576                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1577                        ms_wcCharsetName.c_str(), cname.data());
1578         }
1579     }
1580 }
1581
1582 wxMBConv_iconv::~wxMBConv_iconv()
1583 {
1584     if ( m2w != ICONV_T_INVALID )
1585         iconv_close(m2w);
1586     if ( w2m != ICONV_T_INVALID )
1587         iconv_close(w2m);
1588 }
1589
1590 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1591 {
1592     // find the string length: notice that must be done differently for
1593     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1594     size_t inbuf;
1595     const size_t nulLen = GetMBNulLen();
1596     switch ( nulLen )
1597     {
1598         default:
1599             return (size_t)-1;
1600
1601         case 1:
1602             inbuf = strlen(psz); // arguably more optimized than our version
1603             break;
1604
1605         case 2:
1606         case 4:
1607             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1608             // they also have to start at character boundary and not span two
1609             // adjacent characters
1610             const char *p;
1611             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1612                 ;
1613             inbuf = p - psz;
1614             break;
1615     }
1616
1617 #if wxUSE_THREADS
1618     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1619     //     Unfortunately there is a couple of global wxCSConv objects such as
1620     //     wxConvLocal that are used all over wx code, so we have to make sure
1621     //     the handle is used by at most one thread at the time. Otherwise
1622     //     only a few wx classes would be safe to use from non-main threads
1623     //     as MB<->WC conversion would fail "randomly".
1624     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1625 #endif // wxUSE_THREADS
1626
1627
1628     size_t outbuf = n * SIZEOF_WCHAR_T;
1629     size_t res, cres;
1630     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1631     wchar_t *bufPtr = buf;
1632     const char *pszPtr = psz;
1633
1634     if (buf)
1635     {
1636         // have destination buffer, convert there
1637         cres = iconv(m2w,
1638                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1639                      (char**)&bufPtr, &outbuf);
1640         res = n - (outbuf / SIZEOF_WCHAR_T);
1641
1642         if (ms_wcNeedsSwap)
1643         {
1644             // convert to native endianness
1645             for ( unsigned i = 0; i < res; i++ )
1646                 buf[n] = WC_BSWAP(buf[i]);
1647         }
1648
1649         // NUL-terminate the string if there is any space left
1650         if (res < n)
1651             buf[res] = 0;
1652     }
1653     else
1654     {
1655         // no destination buffer... convert using temp buffer
1656         // to calculate destination buffer requirement
1657         wchar_t tbuf[8];
1658         res = 0;
1659         do {
1660             bufPtr = tbuf;
1661             outbuf = 8*SIZEOF_WCHAR_T;
1662
1663             cres = iconv(m2w,
1664                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1665                          (char**)&bufPtr, &outbuf );
1666
1667             res += 8-(outbuf/SIZEOF_WCHAR_T);
1668         } while ((cres==(size_t)-1) && (errno==E2BIG));
1669     }
1670
1671     if (ICONV_FAILED(cres, inbuf))
1672     {
1673         //VS: it is ok if iconv fails, hence trace only
1674         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1675         return (size_t)-1;
1676     }
1677
1678     return res;
1679 }
1680
1681 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1682 {
1683 #if wxUSE_THREADS
1684     // NB: explained in MB2WC
1685     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1686 #endif
1687
1688     size_t inlen = wxWcslen(psz);
1689     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1690     size_t outbuf = n;
1691     size_t res, cres;
1692
1693     wchar_t *tmpbuf = 0;
1694
1695     if (ms_wcNeedsSwap)
1696     {
1697         // need to copy to temp buffer to switch endianness
1698         // (doing WC_BSWAP twice on the original buffer won't help, as it
1699         //  could be in read-only memory, or be accessed in some other thread)
1700         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1701         for ( size_t i = 0; i < inlen; i++ )
1702             tmpbuf[n] = WC_BSWAP(psz[i]);
1703         tmpbuf[inlen] = L'\0';
1704         psz = tmpbuf;
1705     }
1706
1707     if (buf)
1708     {
1709         // have destination buffer, convert there
1710         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1711
1712         res = n-outbuf;
1713
1714         // NB: iconv was given only wcslen(psz) characters on input, and so
1715         //     it couldn't convert the trailing zero. Let's do it ourselves
1716         //     if there's some room left for it in the output buffer.
1717         if (res < n)
1718             buf[0] = 0;
1719     }
1720     else
1721     {
1722         // no destination buffer... convert using temp buffer
1723         // to calculate destination buffer requirement
1724         char tbuf[16];
1725         res = 0;
1726         do {
1727             buf = tbuf; outbuf = 16;
1728
1729             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1730
1731             res += 16 - outbuf;
1732         } while ((cres==(size_t)-1) && (errno==E2BIG));
1733     }
1734
1735     if (ms_wcNeedsSwap)
1736     {
1737         free(tmpbuf);
1738     }
1739
1740     if (ICONV_FAILED(cres, inbuf))
1741     {
1742         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1743         return (size_t)-1;
1744     }
1745
1746     return res;
1747 }
1748
1749 size_t wxMBConv_iconv::GetMBNulLen() const
1750 {
1751     if ( m_minMBCharWidth == 0 )
1752     {
1753         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1754
1755 #if wxUSE_THREADS
1756         // NB: explained in MB2WC
1757         wxMutexLocker lock(self->m_iconvMutex);
1758 #endif
1759
1760         wchar_t *wnul = L"";
1761         char buf[8]; // should be enough for NUL in any encoding
1762         size_t inLen = sizeof(wchar_t),
1763                outLen = WXSIZEOF(buf);
1764         char *in = (char *)wnul;
1765         char *out = buf;
1766         if ( iconv(w2m, ICONV_CHAR_CAST(&in), &inLen, &out, &outLen) == (size_t)-1 )
1767         {
1768             self->m_minMBCharWidth = (size_t)-1;
1769         }
1770         else // ok
1771         {
1772             self->m_minMBCharWidth = out - buf;
1773         }
1774     }
1775
1776     return m_minMBCharWidth;
1777 }
1778
1779 #endif // HAVE_ICONV
1780
1781
1782 // ============================================================================
1783 // Win32 conversion classes
1784 // ============================================================================
1785
1786 #ifdef wxHAVE_WIN32_MB2WC
1787
1788 // from utils.cpp
1789 #if wxUSE_FONTMAP
1790 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1791 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1792 #endif
1793
1794 class wxMBConv_win32 : public wxMBConv
1795 {
1796 public:
1797     wxMBConv_win32()
1798     {
1799         m_CodePage = CP_ACP;
1800         m_minMBCharWidth = 0;
1801     }
1802
1803 #if wxUSE_FONTMAP
1804     wxMBConv_win32(const wxChar* name)
1805     {
1806         m_CodePage = wxCharsetToCodepage(name);
1807         m_minMBCharWidth = 0;
1808     }
1809
1810     wxMBConv_win32(wxFontEncoding encoding)
1811     {
1812         m_CodePage = wxEncodingToCodepage(encoding);
1813         m_minMBCharWidth = 0;
1814     }
1815 #endif // wxUSE_FONTMAP
1816
    // Convert the NUL-terminated multibyte string psz to wide characters
    // using ::MultiByteToWideChar() for our code page.
    //
    // If buf is NULL only the length of the result is computed, otherwise up
    // to n wchar_t are written to buf. Returns the length of the converted
    // string, not counting the terminating NUL, or (size_t)-1 if the
    // conversion failed or was detected to be lossy.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and breaks the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxConvUTF8.MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxConvUTF7.MB2WC(buf, psz, n);
        }

        // request strict error checking only for the code pages and systems
        // passing the tests below
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return (size_t)-1;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            // convert the result back and compare it with the original input
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return (size_t)-1;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }
1892
    // Convert the NUL-terminated wide string pwz to our code page using
    // ::WideCharToMultiByte().
    //
    // If buf is NULL only the length of the result is computed, otherwise up
    // to n bytes are written to buf. Returns the length of the converted
    // string, not counting the terminating NUL, or (size_t)-1 if the
    // conversion failed or was detected to be lossy.
    size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return (size_t)-1;
        }

        // if we were really converting, check if we succeeded
        if ( buf )
        {
            if ( flags )
            {
                // check if the conversion failed, i.e. if any replacements
                // were done
                if ( usedDef )
                    return (size_t)-1;
            }
            else // we must resort to double tripping...
            {
                // convert the result back and compare it with the original
                wxWCharBuffer wcBuf(n);
                if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
                        wcscmp(wcBuf, pwz) != 0 )
                {
                    // we didn't obtain the same thing we started from, hence
                    // the conversion was lossy and we consider that it failed
                    return (size_t)-1;
                }
            }
        }

        // WideCharToMultiByte() was given a NUL-terminated input and so its
        // result includes the terminating NUL, hence "len - 1"
        return len - 1;
    }
1973
1974     virtual size_t GetMBNulLen() const
1975     {
1976         // Width in bytes of the NUL terminator in this code page. The
1977         // answer is worked out once and then kept in m_minMBCharWidth,
1978         // where 0 stands for "not computed yet" and (size_t)-1 for
1979         // "could not be determined".
1980         if ( m_minMBCharWidth != 0 )
1981             return m_minMBCharWidth;
1982
1983         // convert a lone wide NUL without an output buffer: the return
1984         // value is then used as the number of bytes the terminator takes
1985         const int nulBytes = ::WideCharToMultiByte
1986                              (
1987                                 m_CodePage,  // code page
1988                                 0,           // no flags
1989                                 L"",         // input: just the terminator
1990                                 1,           // a single wide character
1991                                 NULL, 0,     // no output buffer
1992                                 NULL, NULL   // replacement char not needed
1993                              );
1994
1995         // the method is const, so the cache is written through a cast
1996         wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
1997         if ( nulBytes == 1 || nulBytes == 2 || nulBytes == 4 )
1998             self->m_minMBCharWidth = nulBytes;
1999         else // the call failed (0) or returned a width we can't handle
2000         {
2001             if ( nulBytes != 0 )
2002             {
2003                 wxLogDebug(_T("Unexpected NUL length %d"), nulBytes);
2004             }
2005
2006             self->m_minMBCharWidth = (size_t)-1;
2007         }
2008
2009         return m_minMBCharWidth;
2010     }
2011
2012     bool IsOk() const { return !(m_CodePage == -1); } // a code page of -1 means this converter is unusable
2013
2014 private:
2015     static bool CanUseNoBestFit()
2016     {
2017         // Tells whether WC_NO_BEST_FIT_CHARS may be passed to
2018         // WideCharToMultiByte() on the running system. The OS is queried
2019         // on the first call only; the answer lives in a function-local
2020         // static: -1 = not checked yet, 0 = no, 1 = yes.
2021         static int s_isWin98Or2k = -1;
2022         if ( s_isWin98Or2k != -1 )
2023             return s_isWin98Or2k == 1;
2024
2025         int major, minor;
2026         bool supported = false; // unknown system: be conservative
2027         switch ( wxGetOsVersion(&major, &minor) )
2028         {
2029             case wxWINDOWS_NT:
2030                 supported = major >= 5;
2031                 break;
2032             case wxWIN95:
2033                 supported = major >= 4 && minor >= 10;
2034                 break;
2035             default:
2036                 break;
2037         }
2038         s_isWin98Or2k = supported ? 1 : 0;
2039         wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2040         return s_isWin98Or2k == 1;
2041     }
2042
2043     static bool IsAtLeastWin2kSP4()
2044     {
             // Reports whether GetVersionEx() describes a system that is at
             // least version 5.0 with service pack 4; always false when built
             // for Windows CE. The lookup happens once per process.
2045 #ifdef __WXWINCE__
2046         return false;
2047 #else
2048         // -1 until the version has been looked up, then 0 or 1
2049         static int s_isAtLeastWin2kSP4 = -1;
2050         if ( s_isAtLeastWin2kSP4 == -1 )
2051         {
2052             OSVERSIONINFOEX info;
2053             memset(&info, 0, sizeof(info));
2054             info.dwOSVersionInfoSize = sizeof(info);
2055             GetVersionEx((OSVERSIONINFO*)&info);
2056
2057             bool atLeast = false;
2058             if ( info.dwMajorVersion > 5 ) // Vista+
2059                 atLeast = true;
2060             else if ( info.dwMajorVersion == 5 ) // XP/2003, or 2000 SP4+
2061                 atLeast = info.dwMinorVersion > 0 ||
2062                           (info.dwMinorVersion == 0 && info.wServicePackMajor >= 4);
2063             s_isAtLeastWin2kSP4 = atLeast ? 1 : 0;
2064         }
2065
2066         return s_isAtLeastWin2kSP4 == 1;
2067 #endif
2068     }
2069
2070
2071     // the code page we're working with; IsOk() treats -1 as "no usable
         // code page"
2072     long m_CodePage;
2073
2074     // cached result of GetMBNulLen(): 0 initially, meaning "unknown";
2075     // GetMBNulLen() stores (size_t)-1 here if the length can't be determined
2076     size_t m_minMBCharWidth;
2077 };
2078
2079 #endif // wxHAVE_WIN32_MB2WC
2080
2081 // ============================================================================
2082 // Cocoa conversion classes
2083 // ============================================================================
2084
2085 #if defined(__WXCOCOA__)
2086
2087 // RN:  There is no UTF-32 support in either Core Foundation or
2088 // Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
2089 // quite a bit - it's just not public (yet).
2090
2091 #include <CoreFoundation/CFString.h>
2092 #include <CoreFoundation/CFStringEncodingExt.h>
2093
2094 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2095 {
         // Translates a wxFontEncoding value into the CFStringEncoding constant
         // used for it by Core Foundation.
         //
         // wxFONTENCODING_DEFAULT yields whatever CFStringGetSystemEncoding()
         // reports. Any encoding without a case below (including the ones
         // that are commented out) yields kCFStringEncodingInvalidId, the
         // value enc starts out with.
2096     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2097     if ( encoding == wxFONTENCODING_DEFAULT )
2098     {
2099         enc = CFStringGetSystemEncoding();
2100     }
2101     else switch( encoding)
2102     {
             // ISO 8859 family
2103         case wxFONTENCODING_ISO8859_1 :
2104             enc = kCFStringEncodingISOLatin1 ;
2105             break ;
2106         case wxFONTENCODING_ISO8859_2 :
2107             enc = kCFStringEncodingISOLatin2;
2108             break ;
2109         case wxFONTENCODING_ISO8859_3 :
2110             enc = kCFStringEncodingISOLatin3 ;
2111             break ;
2112         case wxFONTENCODING_ISO8859_4 :
2113             enc = kCFStringEncodingISOLatin4;
2114             break ;
2115         case wxFONTENCODING_ISO8859_5 :
2116             enc = kCFStringEncodingISOLatinCyrillic;
2117             break ;
2118         case wxFONTENCODING_ISO8859_6 :
2119             enc = kCFStringEncodingISOLatinArabic;
2120             break ;
2121         case wxFONTENCODING_ISO8859_7 :
2122             enc = kCFStringEncodingISOLatinGreek;
2123             break ;
2124         case wxFONTENCODING_ISO8859_8 :
2125             enc = kCFStringEncodingISOLatinHebrew;
2126             break ;
2127         case wxFONTENCODING_ISO8859_9 :
2128             enc = kCFStringEncodingISOLatin5;
2129             break ;
2130         case wxFONTENCODING_ISO8859_10 :
2131             enc = kCFStringEncodingISOLatin6;
2132             break ;
2133         case wxFONTENCODING_ISO8859_11 :
2134             enc = kCFStringEncodingISOLatinThai;
2135             break ;
2136         case wxFONTENCODING_ISO8859_13 :
2137             enc = kCFStringEncodingISOLatin7;
2138             break ;
2139         case wxFONTENCODING_ISO8859_14 :
2140             enc = kCFStringEncodingISOLatin8;
2141             break ;
2142         case wxFONTENCODING_ISO8859_15 :
2143             enc = kCFStringEncodingISOLatin9;
2144             break ;
2145
             // Cyrillic encodings; note that wxFONTENCODING_ALTERNATIVE and
             // wxFONTENCODING_CP866 further down map to the same CF encoding
2146         case wxFONTENCODING_KOI8 :
2147             enc = kCFStringEncodingKOI8_R;
2148             break ;
2149         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2150             enc = kCFStringEncodingDOSRussian;
2151             break ;
2152
             // no mapping is provided for wxFONTENCODING_BULGARIAN
2153 //      case wxFONTENCODING_BULGARIAN :
2154 //          enc = ;
2155 //          break ;
2156
             // DOS and Windows code pages
2157         case wxFONTENCODING_CP437 :
2158             enc =kCFStringEncodingDOSLatinUS ;
2159             break ;
2160         case wxFONTENCODING_CP850 :
2161             enc = kCFStringEncodingDOSLatin1;
2162             break ;
2163         case wxFONTENCODING_CP852 :
2164             enc = kCFStringEncodingDOSLatin2;
2165             break ;
2166         case wxFONTENCODING_CP855 :
2167             enc = kCFStringEncodingDOSCyrillic;
2168             break ;
2169         case wxFONTENCODING_CP866 :
2170             enc =kCFStringEncodingDOSRussian ;
2171             break ;
2172         case wxFONTENCODING_CP874 :
2173             enc = kCFStringEncodingDOSThai;
2174             break ;
2175         case wxFONTENCODING_CP932 :
2176             enc = kCFStringEncodingDOSJapanese;
2177             break ;
2178         case wxFONTENCODING_CP936 :
2179             enc =kCFStringEncodingDOSChineseSimplif ;
2180             break ;
2181         case wxFONTENCODING_CP949 :
2182             enc = kCFStringEncodingDOSKorean;
2183             break ;
2184         case wxFONTENCODING_CP950 :
2185             enc = kCFStringEncodingDOSChineseTrad;
2186             break ;
2187         case wxFONTENCODING_CP1250 :
2188             enc = kCFStringEncodingWindowsLatin2;
2189             break ;
2190         case wxFONTENCODING_CP1251 :
2191             enc =kCFStringEncodingWindowsCyrillic ;
2192             break ;
2193         case wxFONTENCODING_CP1252 :
2194             enc =kCFStringEncodingWindowsLatin1 ;
2195             break ;
2196         case wxFONTENCODING_CP1253 :
2197             enc = kCFStringEncodingWindowsGreek;
2198             break ;
2199         case wxFONTENCODING_CP1254 :
2200             enc = kCFStringEncodingWindowsLatin5;
2201             break ;
2202         case wxFONTENCODING_CP1255 :
2203             enc =kCFStringEncodingWindowsHebrew ;
2204             break ;
2205         case wxFONTENCODING_CP1256 :
2206             enc =kCFStringEncodingWindowsArabic ;
2207             break ;
2208         case wxFONTENCODING_CP1257 :
2209             enc = kCFStringEncodingWindowsBalticRim;
2210             break ;
             // Unicode transformation formats and EUC-JP; UTF-7 is deliberately
             // left unmapped (see the note below)
2211 //   This only really encodes to UTF7 (if that) evidently
2212 //        case wxFONTENCODING_UTF7 :
2213 //            enc = kCFStringEncodingNonLossyASCII ;
2214 //            break ;
2215         case wxFONTENCODING_UTF8 :
2216             enc = kCFStringEncodingUTF8 ;
2217             break ;
2218         case wxFONTENCODING_EUC_JP :
2219             enc = kCFStringEncodingEUC_JP;
2220             break ;
2221         case wxFONTENCODING_UTF16 :
2222             enc = kCFStringEncodingUnicode ;
2223             break ;
             // classic Mac OS script encodings
2224         case wxFONTENCODING_MACROMAN :
2225             enc = kCFStringEncodingMacRoman ;
2226             break ;
2227         case wxFONTENCODING_MACJAPANESE :
2228             enc = kCFStringEncodingMacJapanese ;
2229             break ;
2230         case wxFONTENCODING_MACCHINESETRAD :
2231             enc = kCFStringEncodingMacChineseTrad ;
2232             break ;
2233         case wxFONTENCODING_MACKOREAN :
2234             enc = kCFStringEncodingMacKorean ;
2235             break ;
2236         case wxFONTENCODING_MACARABIC :
2237             enc = kCFStringEncodingMacArabic ;
2238             break ;
2239         case wxFONTENCODING_MACHEBREW :
2240             enc = kCFStringEncodingMacHebrew ;
2241             break ;
2242         case wxFONTENCODING_MACGREEK :
2243             enc = kCFStringEncodingMacGreek ;
2244             break ;
2245         case wxFONTENCODING_MACCYRILLIC :
2246             enc = kCFStringEncodingMacCyrillic ;
2247             break ;
2248         case wxFONTENCODING_MACDEVANAGARI :
2249             enc = kCFStringEncodingMacDevanagari ;
2250             break ;
2251         case wxFONTENCODING_MACGURMUKHI :
2252             enc = kCFStringEncodingMacGurmukhi ;
2253             break ;
2254         case wxFONTENCODING_MACGUJARATI :
2255             enc = kCFStringEncodingMacGujarati ;
2256             break ;
2257         case wxFONTENCODING_MACORIYA :
2258             enc = kCFStringEncodingMacOriya ;
2259             break ;
2260         case wxFONTENCODING_MACBENGALI :
2261             enc = kCFStringEncodingMacBengali ;
2262             break ;
2263         case wxFONTENCODING_MACTAMIL :
2264             enc = kCFStringEncodingMacTamil ;
2265             break ;
2266         case wxFONTENCODING_MACTELUGU :
2267             enc = kCFStringEncodingMacTelugu ;
2268             break ;
2269         case wxFONTENCODING_MACKANNADA :
2270             enc = kCFStringEncodingMacKannada ;
2271             break ;
2272         case wxFONTENCODING_MACMALAJALAM :
2273             enc = kCFStringEncodingMacMalayalam ;
2274             break ;
2275         case wxFONTENCODING_MACSINHALESE :
2276             enc = kCFStringEncodingMacSinhalese ;
2277             break ;
2278         case wxFONTENCODING_MACBURMESE :
2279             enc = kCFStringEncodingMacBurmese ;
2280             break ;
2281         case wxFONTENCODING_MACKHMER :
2282             enc = kCFStringEncodingMacKhmer ;
2283             break ;
2284         case wxFONTENCODING_MACTHAI :
2285             enc = kCFStringEncodingMacThai ;
2286             break ;
2287         case wxFONTENCODING_MACLAOTIAN :
2288             enc = kCFStringEncodingMacLaotian ;
2289             break ;
2290         case wxFONTENCODING_MACGEORGIAN :
2291             enc = kCFStringEncodingMacGeorgian ;
2292             break ;
2293         case wxFONTENCODING_MACARMENIAN :
2294             enc = kCFStringEncodingMacArmenian ;
2295             break ;
2296         case wxFONTENCODING_MACCHINESESIMP :
2297             enc = kCFStringEncodingMacChineseSimp ;
2298             break ;
2299         case wxFONTENCODING_MACTIBETAN :
2300             enc = kCFStringEncodingMacTibetan ;
2301             break ;
2302         case wxFONTENCODING_MACMONGOLIAN :
2303             enc = kCFStringEncodingMacMongolian ;
2304             break ;
2305         case wxFONTENCODING_MACETHIOPIC :
2306             enc = kCFStringEncodingMacEthiopic ;
2307             break ;
2308         case wxFONTENCODING_MACCENTRALEUR :
2309             enc = kCFStringEncodingMacCentralEurRoman ;
2310             break ;
2311         case wxFONTENCODING_MACVIATNAMESE :
2312             enc = kCFStringEncodingMacVietnamese ;
2313             break ;
2314         case wxFONTENCODING_MACARABICEXT :
2315             enc = kCFStringEncodingMacExtArabic ;
2316             break ;
2317         case wxFONTENCODING_MACSYMBOL :
2318             enc = kCFStringEncodingMacSymbol ;
2319             break ;
2320         case wxFONTENCODING_MACDINGBATS :
2321             enc = kCFStringEncodingMacDingbats ;
2322             break ;
2323         case wxFONTENCODING_MACTURKISH :
2324             enc = kCFStringEncodingMacTurkish ;
2325             break ;
2326         case wxFONTENCODING_MACCROATIAN :
2327             enc = kCFStringEncodingMacCroatian ;
2328             break ;
2329         case wxFONTENCODING_MACICELANDIC :
2330             enc = kCFStringEncodingMacIcelandic ;
2331             break ;
2332         case wxFONTENCODING_MACROMANIAN :
2333             enc = kCFStringEncodingMacRomanian ;
2334             break ;
2335         case wxFONTENCODING_MACCELTIC :
2336             enc = kCFStringEncodingMacCeltic ;
2337             break ;
2338         case wxFONTENCODING_MACGAELIC :
2339             enc = kCFStringEncodingMacGaelic ;
2340             break ;
             // wxFONTENCODING_MACKEYBOARD is intentionally not mapped
2341 //      case wxFONTENCODING_MACKEYBOARD :
2342 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2343 //          break ;
2344         default :
2345             // because gcc is picky
                 // (enc keeps its initial kCFStringEncodingInvalidId value here)
2346             break ;
2347     } ;
2348     return enc ;
2349 }
2350
2351 class wxMBConv_cocoa : public wxMBConv
2352 {
2353 public:
2354     wxMBConv_cocoa()
2355     {
2356         Init(CFStringGetSystemEncoding()) ;
2357     }
2358
2359 #if wxUSE_FONTMAP
2360     wxMBConv_cocoa(const wxChar* name)
2361     {
2362         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2363     }
2364 #endif
2365
2366     wxMBConv_cocoa(wxFontEncoding encoding)
2367     {
2368         Init( wxCFStringEncFromFontEnc(encoding) );
2369     }
2370
2371     ~wxMBConv_cocoa()
2372     {
2373     }
2374
2375     void Init( CFStringEncoding encoding)
2376     {
2377         m_encoding = encoding ;
2378     }
2379
2380     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2381     {
2382         wxASSERT(szUnConv);
2383
2384         CFStringRef theString = CFStringCreateWithBytes (
2385                                                 NULL, //the allocator
2386                                                 (const UInt8*)szUnConv,
2387                                                 strlen(szUnConv),
2388                                                 m_encoding,
2389                                                 false //no BOM/external representation
2390                                                 );
2391
2392         wxASSERT(theString);
2393
2394         size_t nOutLength = CFStringGetLength(theString);
2395
2396         if (szOut == NULL)
2397         {
2398             CFRelease(theString);
2399             return nOutLength;
2400         }
2401
2402         CFRange theRange = { 0, nOutSize };
2403
2404 #if SIZEOF_WCHAR_T == 4
2405         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2406 #endif
2407
2408         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2409
2410         CFRelease(theString);
2411
2412         szUniCharBuffer[nOutLength] = '\0' ;
2413
2414 #if SIZEOF_WCHAR_T == 4
2415         wxMBConvUTF16 converter ;
2416         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2417         delete[] szUniCharBuffer;
2418 #endif
2419
2420         return nOutLength;
2421     }
2422
2423     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2424     {
2425         wxASSERT(szUnConv);
2426
2427         size_t nRealOutSize;
2428         size_t nBufSize = wxWcslen(szUnConv);
2429         UniChar* szUniBuffer = (UniChar*) szUnConv;
2430
2431 #if SIZEOF_WCHAR_T == 4
2432         wxMBConvUTF16 converter ;
2433         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2434         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2435         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2436         nBufSize /= sizeof(UniChar);
2437 #endif
2438
2439         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2440                                 NULL, //allocator
2441                                 szUniBuffer,
2442                                 nBufSize,
2443                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2444                             );
2445
2446         wxASSERT(theString);
2447
2448         //Note that CER puts a BOM when converting to unicode
2449         //so we  check and use getchars instead in that case
2450         if (m_encoding == kCFStringEncodingUnicode)
2451         {
2452             if (szOut != NULL)
2453                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2454
2455             nRealOutSize = CFStringGetLength(theString) + 1;
2456         }
2457         else
2458         {
2459             CFStringGetBytes(
2460                 theString,
2461                 CFRangeMake(0, CFStringGetLength(theString)),
2462                 m_encoding,
2463                 0, //what to put in characters that can't be converted -
2464                     //0 tells CFString to return NULL if it meets such a character
2465                 false, //not an external representation
2466                 (UInt8*) szOut,
2467                 nOutSize,
2468                 (CFIndex*) &nRealOutSize
2469                         );
2470         }
2471
2472         CFRelease(theString);
2473
2474 #if SIZEOF_WCHAR_T == 4
2475         delete[] szUniBuffer;
2476 #endif
2477
2478         return  nRealOutSize - 1;
2479     }
2480
2481     bool IsOk() const
2482     {
2483         return m_encoding != kCFStringEncodingInvalidId &&
2484               CFStringIsEncodingAvailable(m_encoding);
2485     }
2486
2487 private:
2488     CFStringEncoding m_encoding ;
2489 };
2490
2491 #endif // defined(__WXCOCOA__)
2492
2493 // ============================================================================
2494 // Mac conversion classes
2495 // ============================================================================
2496
2497 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2498
2499 class wxMBConv_mac : public wxMBConv
2500 {
     // Multibyte <-> wide character conversion using a pair of Text Encoding
     // Converter objects, one per direction, between m_char_encoding and
     // 16 bit Unicode. IsOk() tells whether both converters exist.
     //
     // NOTE(review): the class owns the two converters but is copyable;
     // a copy would dispose of them twice -- confirm it is never copied.
2501 public:
2502     wxMBConv_mac()
2503     {
2504         Init(CFStringGetSystemEncoding()) ;
2505     }
2506
2507 #if wxUSE_FONTMAP
2508     wxMBConv_mac(const wxChar* name)
2509     {
2510         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2511     }
2512 #endif
2513
2514     wxMBConv_mac(wxFontEncoding encoding)
2515     {
2516         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2517     }
2518
2519     ~wxMBConv_mac()
2520     {
             // only dispose of converters that were actually created
2521         OSStatus status = noErr ;
             if ( m_MB2WC_converter != NULL )
2522             status = TECDisposeConverter(m_MB2WC_converter);
             if ( m_WC2MB_converter != NULL )
2523             status = TECDisposeConverter(m_WC2MB_converter);
2524     }
2525
2526
         // Remembers the encoding and creates both converters. A converter
         // that can't be created is left as NULL so that IsOk() and the
         // destructor can rely on that value.
2527     void Init( TextEncodingBase encoding)
2528     {
2529         OSStatus status = noErr ;
2530         m_char_encoding = encoding ;
2531         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2532
             m_MB2WC_converter = NULL ;
             m_WC2MB_converter = NULL ;

2533         status = TECCreateConverter(&m_MB2WC_converter,
2534                                     m_char_encoding,
2535                                     m_unicode_encoding);
             if ( status != noErr )
                 m_MB2WC_converter = NULL ;

2536         status = TECCreateConverter(&m_WC2MB_converter,
2537                                     m_unicode_encoding,
2538                                     m_char_encoding);
             if ( status != noErr )
                 m_WC2MB_converter = NULL ;
2539     }
2540
         // Converts the NUL-terminated string psz to wide characters.
         //
         // buf - destination, or NULL to only compute the length (a scratch
         //       buffer is used for the conversion in that case)
         // n   - capacity of buf in wchar_t units
         //
         // Returns the number of characters produced; the result is
         // NUL-terminated if it is shorter than n.
2541     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2542     {
2543         OSStatus status = noErr ;
             // zero it: the value is used below even if TECConvertText() fails
2544         ByteCount byteOutLen = 0 ;
2545         ByteCount byteInLen = strlen(psz) ;
2546         wchar_t *tbuf = NULL ;
2547         UniChar* ubuf = NULL ;
2548         size_t res = 0 ;
2549
2550         if (buf == NULL)
2551         {
2552             //apple specs say at least 32
2553             n = wxMax( 32 , byteInLen ) ;
2554             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2555         }
2556         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2557 #if SIZEOF_WCHAR_T == 4
2558         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2559 #else
2560         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2561 #endif
2562         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2563           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2564 #if SIZEOF_WCHAR_T == 4
2565         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2566         // is not properly terminated we get random characters at the end
2567         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2568         wxMBConvUTF16 converter ;
2569         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2570         free( ubuf ) ;
2571 #else
2572         res = byteOutLen / sizeof( UniChar ) ;
2573 #endif
2574         if ( buf == NULL )
2575              free(tbuf) ;
2576
2577         if ( buf  && res < n)
2578             buf[res] = 0;
2579
2580         return res ;
2581     }
2582
         // Converts the NUL-terminated wide string psz to m_char_encoding.
         //
         // buf - destination, or NULL to only compute the length
         // n   - capacity of buf in bytes
         //
         // Returns the number of bytes produced, or (size_t)-1 if the input
         // can't be turned into UTF-16 or if converting the result back does
         // not give the original string (i.e. the conversion was lossy).
2583     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2584     {
2585         OSStatus status = noErr ;
             // zero it: the value is used below even if TECConvertText() fails
2586         ByteCount byteOutLen = 0 ;
2587         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2588
2589         char *tbuf = NULL ;
2590
2591         if (buf == NULL)
2592         {
2593             //apple specs say at least 32
2594             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2595             tbuf = (char*) malloc( n ) ;
2596         }
2597
2598         ByteCount byteBufferLen = n ;
2599         UniChar* ubuf = NULL ;
2600 #if SIZEOF_WCHAR_T == 4
2601         wxMBConvUTF16 converter ;
2602         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
             if ( unicharlen == (size_t)-1 )
             {
                 // not representable in UTF-16: don't use the error code as a
                 // length (tbuf is NULL here unless it was allocated above)
                 free(tbuf) ;
                 return (size_t)-1 ;
             }
2603         byteInLen = unicharlen ;
2604         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2605         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2606 #else
2607         ubuf = (UniChar*) psz ;
2608 #endif
2609         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2610             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2611 #if SIZEOF_WCHAR_T == 4
2612         free( ubuf ) ;
2613 #endif
2614         if ( buf == NULL )
2615             free(tbuf) ;
2616
2617         size_t res = byteOutLen ;
2618         if ( buf  && res < n)
2619         {
2620             buf[res] = 0;
2621
2622             //we need to double-trip to verify it didn't insert any ? in place
2623             //of bogus characters
2624             wxWCharBuffer wcBuf(n);
2625             size_t pszlen = wxWcslen(psz);
2626             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2627                         wxWcslen(wcBuf) != pszlen ||
2628                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2629             {
2630                 // we didn't obtain the same thing we started from, hence
2631                 // the conversion was lossy and we consider that it failed
2632                 return (size_t)-1;
2633             }
2634         }
2635
2636         return res ;
2637     }
2638
2639     bool IsOk() const
2640         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2641
2642 private:
         // converters for the two directions; NULL if creation failed
2643     TECObjectRef m_MB2WC_converter ;
2644     TECObjectRef m_WC2MB_converter ;
2645
2646     TextEncodingBase m_char_encoding ;
2647     TextEncodingBase m_unicode_encoding ;
2648 };
2649
2650 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2651
2652 // ============================================================================
2653 // wxEncodingConverter based conversion classes
2654 // ============================================================================
2655
2656 #if wxUSE_FONTMAP
2657
2658 class wxMBConv_wxwin : public wxMBConv
2659 {
2660     // Conversion implemented with two wxEncodingConverter objects, one per
2661     // direction, between m_enc and wxFONTENCODING_UNICODE. Meant as a
2662     // stop-gap that works until a better implementation exists.
2663 private:
2664     // prepare both converters; m_ok records whether both could be set up
2665     void Init()
2666     {
2667         const bool toWideOk = m2w.Init(m_enc, wxFONTENCODING_UNICODE);
2668         m_ok = toWideOk && w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2669     }
2670
2671 public:
2672     // a NULL name selects wxFONTENCODING_SYSTEM, otherwise the font mapper
2673     // translates the charset name into an encoding
2674     wxMBConv_wxwin(const wxChar* name)
2675     {
2676         m_enc = name ? wxFontMapperBase::Get()->CharsetToEncoding(name, false)
2677                      : wxFONTENCODING_SYSTEM;
2678         Init();
2679     }
2680
2681     wxMBConv_wxwin(wxFontEncoding enc)
2682     {
2683         m_enc = enc;
2684         Init();
2685     }
2686
2687     // Returns strlen(psz), or (size_t)-1 if buf is given and the converter
2688     // fails. The buffer size argument is ignored, so buf has to be large
2689     // enough for whatever the converter writes.
2690     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2691     {
2692         const size_t srcLen = strlen(psz);
2693         if ( buf && !m2w.Convert(psz, buf) )
2694             return (size_t)-1;
2695
2696         return srcLen;
2697     }
2698
2699     // Mirror image of MB2WC(): returns wxWcslen(psz) or (size_t)-1, again
2700     // without looking at the buffer size.
2701     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2702     {
2703         const size_t srcLen = wxWcslen(psz);
2704         if ( buf && !w2m.Convert(psz, buf) )
2705             return (size_t)-1;
2706
2707         return srcLen;
2708     }
2709
2710     // size in bytes of the terminating NUL for m_enc
2711     virtual size_t GetMBNulLen() const
2712     {
2713         if ( m_enc == wxFONTENCODING_UTF16BE ||
2714                 m_enc == wxFONTENCODING_UTF16LE )
2715             return 2;
2716
2717         if ( m_enc == wxFONTENCODING_UTF32BE ||
2718                 m_enc == wxFONTENCODING_UTF32LE )
2719             return 4;
2720
2721         return 1;
2722     }
2723
2724     bool IsOk() const { return m_ok; }
2725
2726 public:
2727     wxFontEncoding m_enc;
2728     wxEncodingConverter m2w, w2m;
2729
2730 private:
2731     // true if Init() managed to set up both converters
2732     bool m_ok;
2733
2734     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2735 };
2739
2740 // make the constructors available for unit testing
2741 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2742 {
2743     // build the converter for the given charset name and hand it to the
2744     // caller; an object that failed to initialise is destroyed and NULL
2745     // is returned in its place
2746     wxMBConv_wxwin* const conv = new wxMBConv_wxwin( name );
2747     if ( conv->IsOk() )
2748         return conv;
2749     delete conv;
2750     return NULL;
     }
2751
2752 #endif // wxUSE_FONTMAP
2753
2754 // ============================================================================
2755 // wxCSConv implementation
2756 // ============================================================================
2757
2758 void wxCSConv::Init()
2759 {
         // common part of all constructors: no charset name and no converter
         // object yet, and the "deferred" flag raised
2760     m_deferred = true;
2761     m_convReal = NULL;
2762     m_name = NULL;
2763 }
2764
2765 wxCSConv::wxCSConv(const wxChar *charset)
2766 {
2767     // remember a copy of the charset name, if one was given
2768     Init();
2769     if ( charset != NULL )
2770         SetName(charset);
2771
2772     // with font mapping support the encoding is derived from the name,
2773     // otherwise the system encoding is assumed
2774 #if wxUSE_FONTMAP
2775     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2776 #else
2777     m_encoding = wxFONTENCODING_SYSTEM;
2778 #endif
2779 }
2780
2781 wxCSConv::wxCSConv(wxFontEncoding encoding)
2782 {
2783     // MAX and DEFAULT are rejected: assert, then use the system encoding
2784     const bool invalid = encoding == wxFONTENCODING_MAX ||
2785                          encoding == wxFONTENCODING_DEFAULT;
2786     if ( invalid )
2787     {
2788         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2789     }
2790
2791     Init();
2792     m_encoding = invalid ? wxFONTENCODING_SYSTEM : encoding;
2793 }
2794
2795 wxCSConv::~wxCSConv()
2796 {
2797     this->Clear(); // frees the charset name and deletes the converter
2798 }
2799
2800 wxCSConv::wxCSConv(const wxCSConv& conv)
2801         : wxMBConv()
2802 {
2803     // copy the description (encoding and name) only; the converter object
2804     // of the source is not shared with the copy
2805     Init();
2806     m_encoding = conv.m_encoding;
2807     SetName(conv.m_name);
     }
2808
2809 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2810 {
         // Takes over the charset name and encoding of conv; the converter
         // object itself is not copied.
         //
         // Assigning an object to itself must be a no-op: Clear() would free
         // the very name that is about to be copied.
         if ( this != &conv )
         {
2811         Clear();
2812
2813         SetName(conv.m_name);
2814         m_encoding = conv.m_encoding;

             // Clear() deleted the old converter but SetName() raises the
             // flag only for a non-NULL name, so raise it here as Init() does
             // (presumably this is what makes the converter get created again
             // for the new encoding -- confirm)
             m_deferred = true;
         }
2815
2816     return *this;
2817 }
2818
2819 void wxCSConv::Clear()
2820 {
2821     // release the charset name and the converter object, resetting each
2822     // pointer right away so that nothing dangles
2823     free(m_name);
2824     m_name = NULL;
2825     delete m_convReal;
2826     m_convReal = NULL;
     }
2827
2828 void wxCSConv::SetName(const wxChar *charset)
2829 {
2830     // a NULL name leaves the object untouched
2831     if ( !charset )
2832         return;
2833
         // keep a private copy of the name; the previous m_name is simply
         // overwritten, so it is expected to be NULL at this point
2834     m_name = wxStrdup(charset);
2835     m_deferred = true;
     }
2836
2837 #if wxUSE_FONTMAP
2838 #include "wx/hashmap.h"
2839
2840 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2841                      wxEncodingNameCache );
2842
     // Cache used by wxCSConv::DoCreate() when looking for an iconv name for an
     // encoding: it remembers the name that worked for each encoding, with an
     // empty string recording that none of the known names worked.
2843 static wxEncodingNameCache gs_nameCache;
2844 #endif
2845
2846 wxMBConv *wxCSConv::DoCreate() const
2847 {
2848 #if wxUSE_FONTMAP
2849     wxLogTrace(TRACE_STRCONV,
2850                wxT("creating conversion for %s"),
2851                (m_name ? m_name
2852                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2853 #endif // wxUSE_FONTMAP
2854
2855     // check for the special case of ASCII or ISO8859-1 charset: as we have
2856     // special knowledge of it anyhow, we don't need to create a special
2857     // conversion object
2858     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2859             m_encoding == wxFONTENCODING_DEFAULT )
2860     {
2861         // don't convert at all
2862         return NULL;
2863     }
2864
2865     // we trust OS to do conversion better than we can so try external
2866     // conversion methods first
2867     //
2868     // the full order is:
2869     //      1. OS conversion (iconv() under Unix or Win32 API)
2870     //      2. hard coded conversions for UTF
2871     //      3. wxEncodingConverter as fall back
2872
2873     // step (1)
2874 #ifdef HAVE_ICONV
2875 #if !wxUSE_FONTMAP
2876     if ( m_name )
2877 #endif // !wxUSE_FONTMAP
2878     {
2879         wxString name(m_name);
2880         wxFontEncoding encoding(m_encoding);
2881
2882         if ( !name.empty() )
2883         {
2884             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2885             if ( conv->IsOk() )
2886                 return conv;
2887
2888             delete conv;
2889
2890 #if wxUSE_FONTMAP
2891             encoding =
2892                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2893 #endif // wxUSE_FONTMAP
2894         }
2895 #if wxUSE_FONTMAP
2896         {
2897             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2898             if ( it != gs_nameCache.end() )
2899             {
2900                 if ( it->second.empty() )
2901                     return NULL;
2902
2903                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
2904                 if ( conv->IsOk() )
2905                     return conv;
2906
2907                 delete conv;
2908             }
2909
2910             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2911
2912             for ( ; *names; ++names )
2913             {
2914                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
2915                 if ( conv->IsOk() )
2916                 {
2917                     gs_nameCache[encoding] = *names;
2918                     return conv;
2919                 }
2920
2921                 delete conv;
2922             }
2923
2924             gs_nameCache[encoding] = _T(""); // cache the failure
2925         }
2926 #endif // wxUSE_FONTMAP
2927     }
2928 #endif // HAVE_ICONV
2929
2930 #ifdef wxHAVE_WIN32_MB2WC
2931     {
2932 #if wxUSE_FONTMAP
2933         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2934                                       : new wxMBConv_win32(m_encoding);
2935         if ( conv->IsOk() )
2936             return conv;
2937
2938         delete conv;
2939 #else
2940         return NULL;
2941 #endif
2942     }
2943 #endif // wxHAVE_WIN32_MB2WC
2944 #if defined(__WXMAC__)
2945     {
2946         // leave UTF16 and UTF32 to the built-ins of wx
2947         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2948             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2949         {
2950
2951 #if wxUSE_FONTMAP
2952             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2953                                         : new wxMBConv_mac(m_encoding);
2954 #else
2955             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2956 #endif
2957             if ( conv->IsOk() )
2958                  return conv;
2959
2960             delete conv;
2961         }
2962     }
2963 #endif
2964 #if defined(__WXCOCOA__)
2965     {
2966         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2967         {
2968
2969 #if wxUSE_FONTMAP
2970             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2971                                           : new wxMBConv_cocoa(m_encoding);
2972 #else
2973             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2974 #endif
2975             if ( conv->IsOk() )
2976                  return conv;
2977
2978             delete conv;
2979         }
2980     }
2981 #endif
2982     // step (2)
2983     wxFontEncoding enc = m_encoding;
2984 #if wxUSE_FONTMAP
2985     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2986     {
2987         // use "false" to suppress interactive dialogs -- we can be called from
2988         // anywhere and popping up a dialog from here is the last thing we want to
2989         // do
2990         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2991     }
2992 #endif // wxUSE_FONTMAP
2993
2994     switch ( enc )
2995     {
2996         case wxFONTENCODING_UTF7:
2997              return new wxMBConvUTF7;
2998
2999         case wxFONTENCODING_UTF8:
3000              return new wxMBConvUTF8;
3001
3002         case wxFONTENCODING_UTF16BE:
3003              return new wxMBConvUTF16BE;
3004
3005         case wxFONTENCODING_UTF16LE:
3006              return new wxMBConvUTF16LE;
3007
3008         case wxFONTENCODING_UTF32BE:
3009              return new wxMBConvUTF32BE;
3010
3011         case wxFONTENCODING_UTF32LE:
3012              return new wxMBConvUTF32LE;
3013
3014         default:
3015              // nothing to do but put here to suppress gcc warnings
3016              ;
3017     }
3018
3019     // step (3)
3020 #if wxUSE_FONTMAP
3021     {
3022         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3023                                       : new wxMBConv_wxwin(m_encoding);
3024         if ( conv->IsOk() )
3025             return conv;
3026
3027         delete conv;
3028     }
3029 #endif // wxUSE_FONTMAP
3030
3031     // NB: This is a hack to prevent deadlock. What could otherwise happen
3032     //     in Unicode build: wxConvLocal creation ends up being here
3033     //     because of some failure and logs the error. But wxLog will try to
3034     //     attach timestamp, for which it will need wxConvLocal (to convert
3035     //     time to char* and then wchar_t*), but that fails, tries to log
3036     //     error, but wxLog has a (already locked) critical section that
3037     //     guards static buffer.
3038     static bool alreadyLoggingError = false;
3039     if (!alreadyLoggingError)
3040     {
3041         alreadyLoggingError = true;
3042         wxLogError(_("Cannot convert from the charset '%s'!"),
3043                    m_name ? m_name
3044                       :
3045 #if wxUSE_FONTMAP
3046                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3047 #else // !wxUSE_FONTMAP
3048                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3049 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3050               );
3051         alreadyLoggingError = false;
3052     }
3053
3054     return NULL;
3055 }
3056
3057 void wxCSConv::CreateConvIfNeeded() const
3058 {
3059     if ( m_deferred )
3060     {
3061         wxCSConv *self = (wxCSConv *)this; // const_cast
3062
3063 #if wxUSE_INTL
3064         // if we don't have neither the name nor the encoding, use the default
3065         // encoding for this system
3066         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3067         {
3068             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3069         }
3070 #endif // wxUSE_INTL
3071
3072         self->m_convReal = DoCreate();
3073         self->m_deferred = false;
3074     }
3075 }
3076
3077 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3078 {
3079     CreateConvIfNeeded();
3080
3081     if (m_convReal)
3082         return m_convReal->MB2WC(buf, psz, n);
3083
3084     // latin-1 (direct)
3085     size_t len = strlen(psz);
3086
3087     if (buf)
3088     {
3089         for (size_t c = 0; c <= len; c++)
3090             buf[c] = (unsigned char)(psz[c]);
3091     }
3092
3093     return len;
3094 }
3095
3096 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3097 {
3098     CreateConvIfNeeded();
3099
3100     if (m_convReal)
3101         return m_convReal->WC2MB(buf, psz, n);
3102
3103     // latin-1 (direct)
3104     const size_t len = wxWcslen(psz);
3105     if (buf)
3106     {
3107         for (size_t c = 0; c <= len; c++)
3108         {
3109             if (psz[c] > 0xFF)
3110                 return (size_t)-1;
3111             buf[c] = (char)psz[c];
3112         }
3113     }
3114     else
3115     {
3116         for (size_t c = 0; c <= len; c++)
3117         {
3118             if (psz[c] > 0xFF)
3119                 return (size_t)-1;
3120         }
3121     }
3122
3123     return len;
3124 }
3125
3126 size_t wxCSConv::GetMBNulLen() const
3127 {
3128     CreateConvIfNeeded();
3129
3130     if ( m_convReal )
3131     {
3132         return m_convReal->GetMBNulLen();
3133     }
3134
3135     return 1;
3136 }
3137
3138 // ----------------------------------------------------------------------------
3139 // globals
3140 // ----------------------------------------------------------------------------
3141
3142 #ifdef __WINDOWS__
3143     static wxMBConv_win32 wxConvLibcObj;
3144 #elif defined(__WXMAC__) && !defined(__MACH__)
3145     static wxMBConv_mac wxConvLibcObj ;
3146 #else
3147     static wxMBConvLibc wxConvLibcObj;
3148 #endif
3149
3150 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3151 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3152 static wxMBConvUTF7 wxConvUTF7Obj;
3153 static wxMBConvUTF8 wxConvUTF8Obj;
3154
3155 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3156 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3157 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3158 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3159 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3160 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3161 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3162 #ifdef __WXOSX__
3163                                     wxConvUTF8Obj;
3164 #else
3165                                     wxConvLibcObj;
3166 #endif
3167
3168
3169 #else // !wxUSE_WCHAR_T
3170
3171 // stand-ins in absence of wchar_t
3172 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3173                                 wxConvISO8859_1,
3174                                 wxConvLocal,
3175                                 wxConvUTF8;
3176
3177 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T