src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 // For compilers that support precompilation, includes "wx.h".
  24 #include "wx/wxprec.h"
  25
  26 #ifdef __BORLANDC__
  27   #pragma hdrstop
  28 #endif
  29
  30 #ifndef WX_PRECOMP
  31     #include "wx/intl.h"
  32     #include "wx/log.h"
  33 #endif // WX_PRECOMP
  34
  35 #include "wx/strconv.h"
  36
  37 #if wxUSE_WCHAR_T
  38
  39 #ifdef __WINDOWS__
  40     #include "wx/msw/private.h"
  41     #include "wx/msw/missing.h"
  42 #endif
  43
  44 #ifndef __WXWINCE__
  45 #include <errno.h>
  46 #endif
  47
  48 #include <ctype.h>
  49 #include <string.h>
  50 #include <stdlib.h>
  51
  52 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  53     #define wxHAVE_WIN32_MB2WC
  54 #endif // __WIN32__ but !__WXMICROWIN__
  55
  56 #ifdef __SALFORDC__
  57     #include <clib.h>
  58 #endif
  59
  60 #ifdef HAVE_ICONV
  61     #include <iconv.h>
  62     #include "wx/thread.h"
  63 #endif
  64
  65 #include "wx/encconv.h"
  66 #include "wx/fontmap.h"
  67 #include "wx/utils.h"
  68
  69 #ifdef __WXMAC__
  70 #ifndef __DARWIN__
  71 #include <ATSUnicode.h>
  72 #include <TextCommon.h>
  73 #include <TextEncodingConverter.h>
  74 #endif
  75
  76 #include  "wx/mac/private.h"  // includes mac headers
  77 #endif
  78
  79 #define TRACE_STRCONV _T("strconv")
  80
  81 #if SIZEOF_WCHAR_T == 2
  82     #define WC_UTF16
  83 #endif
  84
  85 // ============================================================================
  86 // implementation
  87 // ============================================================================
  88
  89 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  90 static bool NotAllNULs(const char *p, size_t n)
  91 {
  92     while ( n && *p++ == '\0' )
  93         n--;
  94
  95     return n != 0;
  96 }
  97
  98 // ----------------------------------------------------------------------------
  99 // UTF-16 en/decoding to/from UCS-4
 100 // ----------------------------------------------------------------------------
 101
 102
 103 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 104 {
 105     if (input<=0xffff)
 106     {
 107         if (output)
 108             *output = (wxUint16) input;
 109         return 1;
 110     }
 111     else if (input>=0x110000)
 112     {
 113         return (size_t)-1;
 114     }
 115     else
 116     {
 117         if (output)
 118         {
 119             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 120             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 121         }
 122         return 2;
 123     }
 124 }
 125
 126 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 127 {
 128     if ((*input<0xd800) || (*input>0xdfff))
 129     {
 130         output = *input;
 131         return 1;
 132     }
 133     else if ((input[1]<0xdc00) || (input[1]>0xdfff))
 134     {
 135         output = *input;
 136         return (size_t)-1;
 137     }
 138     else
 139     {
 140         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 141         return 2;
 142     }
 143 }
 144
 145
 146 // ----------------------------------------------------------------------------
 147 // wxMBConv
 148 // ----------------------------------------------------------------------------
 149
 150 size_t
 151 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 152                   const char *src, size_t srcLen) const
 153 {
 154     // although new conversion classes are supposed to implement this function
 155     // directly, the existins ones only implement the old MB2WC() and so, to
 156     // avoid to have to rewrite all conversion classes at once, we provide a
 157     // default (but not efficient) implementation of this one in terms of the
 158     // old function by copying the input to ensure that it's NUL-terminated and
 159     // then using MB2WC() to convert it
 160
 161     // the number of chars [which would be] written to dst [if it were not NULL]
 162     size_t dstWritten = 0;
 163
 164     // the number of NULs terminating this string
 165     size_t nulLen wxDUMMY_INITIALIZE(0);
 166
 167     // if we were not given the input size we just have to assume that the
 168     // string is properly terminated as we have no way of knowing how long it
 169     // is anyhow, but if we do have the size check whether there are enough
 170     // NULs at the end
 171     wxCharBuffer bufTmp;
 172     const char *srcEnd;
 173     if ( srcLen != (size_t)-1 )
 174     {
 175         // we need to know how to find the end of this string
 176         nulLen = GetMBNulLen();
 177         if ( nulLen == wxCONV_FAILED )
 178             return wxCONV_FAILED;
 179
 180         // if there are enough NULs we can avoid the copy
 181         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 182         {
 183             // make a copy in order to properly NUL-terminate the string
 184             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 185             char * const p = bufTmp.data();
 186             memcpy(p, src, srcLen);
 187             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 188                 *s = '\0';
 189
 190             src = bufTmp;
 191         }
 192
 193         srcEnd = src + srcLen;
 194     }
 195     else // quit after the first loop iteration
 196     {
 197         srcEnd = NULL;
 198     }
 199
 200     for ( ;; )
 201     {
 202         // try to convert the current chunk
 203         size_t lenChunk = MB2WC(NULL, src, 0);
 204         if ( lenChunk == 0 )
 205         {
 206             // nothing left in the input string, conversion succeeded;
 207             // but still account for the trailing NULL
 208             dstWritten++;
 209             break;
 210         }
 211
 212         if ( lenChunk == wxCONV_FAILED )
 213             return wxCONV_FAILED;
 214
 215         lenChunk++; // for trailing NUL
 216
 217         dstWritten += lenChunk;
 218
 219         if ( dst )
 220         {
 221             if ( dstWritten > dstLen )
 222                 return wxCONV_FAILED;
 223
 224             if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
 225                 return wxCONV_FAILED;
 226
 227             dst += lenChunk;
 228         }
 229
 230         if ( !srcEnd )
 231         {
 232             // we convert the entire string in this case, as we suppose that the
 233             // string is NUL-terminated and so srcEnd is not used at all
 234             break;
 235         }
 236
 237         // advance the input pointer past the end of this chunk
 238         while ( NotAllNULs(src, nulLen) )
 239         {
 240             // notice that we must skip over multiple bytes here as we suppose
 241             // that if NUL takes 2 or 4 bytes, then all the other characters do
 242             // too and so if advanced by a single byte we might erroneously
 243             // detect sequences of NUL bytes in the middle of the input
 244             src += nulLen;
 245         }
 246
 247         src += nulLen; // skipping over its terminator as well
 248
 249         // note that ">=" (and not just "==") is needed here as the terminator
 250         // we skipped just above could be inside or just after the buffer
 251         // delimited by inEnd
 252         if ( src >= srcEnd )
 253             break;
 254     }
 255
 256     return dstWritten;
 257 }
 258
 259 size_t
 260 wxMBConv::FromWChar(char *dst, size_t dstLen,
 261                     const wchar_t *src, size_t srcLen) const
 262 {
 263     // the number of chars [which would be] written to dst [if it were not NULL]
 264     size_t dstWritten = 0;
 265
 266     // make a copy of the input string unless it is already properly
 267     // NUL-terminated
 268     //
 269     // if we don't know its length we have no choice but to assume that it is,
 270     // indeed, properly terminated
 271     wxWCharBuffer bufTmp;
 272     if ( srcLen == (size_t)-1 )
 273     {
 274         srcLen = wxWcslen(src) + 1;
 275     }
 276     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 277     {
 278         // make a copy in order to properly NUL-terminate the string
 279         bufTmp = wxWCharBuffer(srcLen);
 280         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 281         src = bufTmp;
 282     }
 283
 284     const size_t lenNul = GetMBNulLen();
 285     for ( const wchar_t * const srcEnd = src + srcLen;
 286           src < srcEnd;
 287           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 288     {
 289         // try to convert the current chunk
 290         size_t lenChunk = WC2MB(NULL, src, 0);
 291
 292         if ( lenChunk == wxCONV_FAILED )
 293             return wxCONV_FAILED;
 294
 295         lenChunk += lenNul;
 296         dstWritten += lenChunk;
 297
 298         if ( dst )
 299         {
 300             if ( dstWritten > dstLen )
 301                 return wxCONV_FAILED;
 302
 303             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 304                 return wxCONV_FAILED;
 305
 306             dst += lenChunk;
 307         }
 308     }
 309
 310     return dstWritten;
 311 }
 312
 313 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 314 {
 315     size_t rc = ToWChar(outBuff, outLen, inBuff);
 316     if ( rc != (size_t)wxCONV_FAILED )
 317     {
 318         // ToWChar() returns the buffer length, i.e. including the trailing
 319         // NUL, while this method doesn't take it into account
 320         rc--;
 321     }
 322
 323     return rc;
 324 }
 325
 326 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 327 {
 328     size_t rc = FromWChar(outBuff, outLen, inBuff);
 329     if ( rc != (size_t)wxCONV_FAILED )
 330     {
 331         rc -= GetMBNulLen();
 332     }
 333
 334     return rc;
 335 }
 336
 337 wxMBConv::~wxMBConv()
 338 {
 339     // nothing to do here (necessary for Darwin linking probably)
 340 }
 341
 342 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 343 {
 344     if ( psz )
 345     {
 346         // calculate the length of the buffer needed first
 347         const size_t nLen = MB2WC(NULL, psz, 0);
 348         if ( nLen != (size_t)wxCONV_FAILED )
 349         {
 350             // now do the actual conversion
 351             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 352
 353             // +1 for the trailing NULL
 354             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 355                 return buf;
 356         }
 357     }
 358
 359     return wxWCharBuffer();
 360 }
 361
 362 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 363 {
 364     if ( pwz )
 365     {
 366         const size_t nLen = WC2MB(NULL, pwz, 0);
 367         if ( nLen != (size_t)wxCONV_FAILED )
 368         {
 369             // extra space for trailing NUL(s)
 370             static const size_t extraLen = GetMaxMBNulLen();
 371
 372             wxCharBuffer buf(nLen + extraLen - 1);
 373             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 374                 return buf;
 375         }
 376     }
 377
 378     return wxCharBuffer();
 379 }
 380
 381 const wxWCharBuffer
 382 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 383 {
 384     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 385     if ( dstLen != (size_t)wxCONV_FAILED )
 386     {
 387         wxWCharBuffer wbuf(dstLen - 1);
 388         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) )
 389         {
 390             if ( outLen )
 391                 *outLen = dstLen - 1;
 392             return wbuf;
 393         }
 394     }
 395
 396     if ( outLen )
 397         *outLen = 0;
 398
 399     return wxWCharBuffer();
 400 }
 401
 402 const wxCharBuffer
 403 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 404 {
 405     const size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 406     if ( dstLen != (size_t)wxCONV_FAILED )
 407     {
 408         wxCharBuffer buf(dstLen - 1);
 409         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) )
 410         {
 411             if ( outLen )
 412                 *outLen = dstLen - 1;
 413
 414             return buf;
 415         }
 416     }
 417
 418     if ( outLen )
 419         *outLen = 0;
 420
 421     return wxCharBuffer();
 422 }
 423
 424 // ----------------------------------------------------------------------------
 425 // wxMBConvLibc
 426 // ----------------------------------------------------------------------------
 427
 428 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 429 {
 430     return wxMB2WC(buf, psz, n);
 431 }
 432
 433 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 434 {
 435     return wxWC2MB(buf, psz, n);
 436 }
 437
 438 // ----------------------------------------------------------------------------
 439 // wxConvBrokenFileNames
 440 // ----------------------------------------------------------------------------
 441
 442 #ifdef __UNIX__
 443
 444 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 445 {
 446     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 447                   || wxStricmp(charset, _T("UTF8")) == 0  )
 448         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 449     else
 450         m_conv = new wxCSConv(charset);
 451 }
 452
 453 #endif // __UNIX__
 454
 455 // ----------------------------------------------------------------------------
 456 // UTF-7
 457 // ----------------------------------------------------------------------------
 458
 459 // Implementation (C) 2004 Fredrik Roubert
 460
 461 //
 462 // BASE64 decoding table
 463 //
 464 static const unsigned char utf7unb64[] =
 465 {
 466     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 467     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 468     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 469     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 470     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 471     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 472     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 473     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 474     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 475     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 476     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 477     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 478     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 479     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 480     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 481     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 482     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 483     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 485     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 486     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 488     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 489     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 491     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 492     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 494     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 495     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 497     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 498 };
 499
 500 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 501 {
 502     size_t len = 0;
 503
 504     while ( *psz && (!buf || (len < n)) )
 505     {
 506         unsigned char cc = *psz++;
 507         if (cc != '+')
 508         {
 509             // plain ASCII char
 510             if (buf)
 511                 *buf++ = cc;
 512             len++;
 513         }
 514         else if (*psz == '-')
 515         {
 516             // encoded plus sign
 517             if (buf)
 518                 *buf++ = cc;
 519             len++;
 520             psz++;
 521         }
 522         else // start of BASE64 encoded string
 523         {
 524             bool lsb, ok;
 525             unsigned int d, l;
 526             for ( ok = lsb = false, d = 0, l = 0;
 527                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 528                   psz++ )
 529             {
 530                 d <<= 6;
 531                 d += cc;
 532                 for (l += 6; l >= 8; lsb = !lsb)
 533                 {
 534                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 535                     if (lsb)
 536                     {
 537                         if (buf)
 538                             *buf++ |= c;
 539                         len ++;
 540                     }
 541                     else
 542                     {
 543                         if (buf)
 544                             *buf = (wchar_t)(c << 8);
 545                     }
 546
 547                     ok = true;
 548                 }
 549             }
 550
 551             if ( !ok )
 552             {
 553                 // in valid UTF7 we should have valid characters after '+'
 554                 return (size_t)-1;
 555             }
 556
 557             if (*psz == '-')
 558                 psz++;
 559         }
 560     }
 561
 562     if ( buf && (len < n) )
 563         *buf = '\0';
 564
 565     return len;
 566 }
 567
 568 //
 569 // BASE64 encoding table
 570 //
 571 static const unsigned char utf7enb64[] =
 572 {
 573     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 574     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 575     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 576     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 577     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 578     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 579     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 580     '4', '5', '6', '7', '8', '9', '+', '/'
 581 };
 582
 583 //
 584 // UTF-7 encoding table
 585 //
 586 // 0 - Set D (directly encoded characters)
 587 // 1 - Set O (optional direct characters)
 588 // 2 - whitespace characters (optional)
 589 // 3 - special characters
 590 //
 591 static const unsigned char utf7encode[128] =
 592 {
 593     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 594     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 595     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 596     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 597     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 598     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 599     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 600     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 601 };
 602
 603 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 604 {
 605     size_t len = 0;
 606
 607     while (*psz && ((!buf) || (len < n)))
 608     {
 609         wchar_t cc = *psz++;
 610         if (cc < 0x80 && utf7encode[cc] < 1)
 611         {
 612             // plain ASCII char
 613             if (buf)
 614                 *buf++ = (char)cc;
 615
 616             len++;
 617         }
 618 #ifndef WC_UTF16
 619         else if (((wxUint32)cc) > 0xffff)
 620         {
 621             // no surrogate pair generation (yet?)
 622             return (size_t)-1;
 623         }
 624 #endif
 625         else
 626         {
 627             if (buf)
 628                 *buf++ = '+';
 629             len++;
 630             if (cc != '+')
 631             {
 632                 // BASE64 encode string
 633                 unsigned int lsb, d, l;
 634                 for (d = 0, l = 0; /*nothing*/; psz++)
 635                 {
 636                     for (lsb = 0; lsb < 2; lsb ++)
 637                     {
 638                         d <<= 8;
 639                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 640
 641                         for (l += 8; l >= 6; )
 642                         {
 643                             l -= 6;
 644                             if (buf)
 645                                 *buf++ = utf7enb64[(d >> l) % 64];
 646                             len++;
 647                         }
 648                     }
 649                     cc = *psz;
 650                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 651                         break;
 652                 }
 653                 if (l != 0)
 654                 {
 655                     if (buf)
 656                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 657                     len++;
 658                 }
 659             }
 660
 661             if (buf)
 662                 *buf++ = '-';
 663             len++;
 664         }
 665     }
 666
 667     if (buf && (len < n))
 668         *buf = 0;
 669
 670     return len;
 671 }
 672
 673 // ----------------------------------------------------------------------------
 674 // UTF-8
 675 // ----------------------------------------------------------------------------
 676
 677 static wxUint32 utf8_max[]=
 678     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 679
 680 // boundaries of the private use area we use to (temporarily) remap invalid
 681 // characters invalid in a UTF-8 encoded string
 682 const wxUint32 wxUnicodePUA = 0x100000;
 683 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 684
     // Decode the NUL-terminated UTF-8 string psz.
     //
     // If buf is NULL only the length of the result is computed, otherwise the
     // loop stops once n wide characters have been produced and a NUL is
     // appended if there is still room for it. Returns the number of wide
     // characters produced, not counting the NUL.
     //
     // The handling of invalid input depends on m_options:
     //  - with MAP_INVALID_UTF8_TO_PUA each byte of an invalid sequence becomes
     //    the character wxUnicodePUA + byte (and the characters of this range
     //    found in the input are themselves treated as invalid)
     //  - with MAP_INVALID_UTF8_TO_OCTAL each such byte becomes a backslash
     //    followed by 3 octal digits (and the backslashes in the input are
     //    doubled)
     //  - otherwise (size_t)-1 is returned
     //
     // NOTE(review): when WC_UTF16 is defined, encode_utf16() can store 2 units
     // while the loop conditions only guarantee room for one -- confirm that
     // all callers size buf using a preliminary call with NULL buf.
  685 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
  686 {
  687     size_t len = 0;
  688
  689     while (*psz && ((!buf) || (len < n)))
  690     {
     // opsz remembers the start of the sequence: its bytes are remapped one
     // by one if it turns out to be invalid
  691         const char *opsz = psz;
  692         bool invalid = false;
  693         unsigned char cc = *psz++, fc = cc;
     // count the leading 1 bits of the first byte
  694         unsigned cnt;
  695         for (cnt = 0; fc & 0x80; cnt++)
  696             fc <<= 1;
  697
  698         if (!cnt)
  699         {
  700             // plain ASCII char
  701             if (buf)
  702                 *buf++ = cc;
  703             len++;
  704
  705             // escape the escape character for octal escapes
  706             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
  707                     && cc == '\\' && (!buf || len < n))
  708             {
  709                 if (buf)
  710                     *buf++ = cc;
  711                 len++;
  712             }
  713         }
  714         else
  715         {
     // cnt is now the number of continuation bytes expected; it is 0
     // if the first byte was itself a continuation byte (10xxxxxx)
  716             cnt--;
  717             if (!cnt)
  718             {
  719                 // invalid UTF-8 sequence
  720                 invalid = true;
  721             }
  722             else
  723             {
     // ocnt indexes the greatest value which could have been
     // encoded using one byte less, see the overlong check below
  724                 unsigned ocnt = cnt - 1;
  725                 wxUint32 res = cc & (0x3f >> cnt);
  726                 while (cnt--)
  727                 {
  728                     cc = *psz;
  729                     if ((cc & 0xC0) != 0x80)
  730                     {
  731                         // invalid UTF-8 sequence
  732                         invalid = true;
  733                         break;
  734                     }
  735
  736                     psz++;
  737                     res = (res << 6) | (cc & 0x3f);
  738                 }
     // a value which would have fit into a shorter sequence is
     // rejected too, i.e. overlong encodings are not accepted
  739                 if (invalid || res <= utf8_max[ocnt])
  740                 {
  741                     // illegal UTF-8 encoding
  742                     invalid = true;
  743                 }
  744                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
  745                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
  746                 {
  747                     // if one of our PUA characters turns up externally
  748                     // it must also be treated as an illegal sequence
  749                     // (a bit like you have to escape an escape character)
  750                     invalid = true;
  751                 }
  752                 else
  753                 {
  754 #ifdef WC_UTF16
  755                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
  756                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
  757                     if (pa == (size_t)-1)
  758                     {
  759                         invalid = true;
  760                     }
  761                     else
  762                     {
  763                         if (buf)
  764                             buf += pa;
  765                         len += pa;
  766                     }
  767 #else // !WC_UTF16
  768                     if (buf)
  769                         *buf++ = (wchar_t)res;
  770                     len++;
  771 #endif // WC_UTF16/!WC_UTF16
  772                 }
  773             }
     // remap or reject all the bytes from opsz up to (but excluding)
     // psz, i.e. the bytes consumed so far for this sequence
  774             if (invalid)
  775             {
  776                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
  777                 {
  778                     while (opsz < psz && (!buf || len < n))
  779                     {
  780 #ifdef WC_UTF16
  781                         // cast is ok because wchar_t == wxUint16 if WC_UTF16
  782                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
  783                         wxASSERT(pa != (size_t)-1);
  784                         if (buf)
  785                             buf += pa;
  786                         opsz++;
  787                         len += pa;
  788 #else
  789                         if (buf)
  790                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
  791                         opsz++;
  792                         len++;
  793 #endif
  794                     }
  795                 }
  796                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
  797                 {
  798                     while (opsz < psz && (!buf || len < n))
  799                     {
     // the 4 characters of the escape are only stored if
     // there is room for them and for one more character
     // but len is advanced by 4 in any case
  800                         if ( buf && len + 3 < n )
  801                         {
  802                             unsigned char on = *opsz;
  803                             *buf++ = L'\\';
  804                             *buf++ = (wchar_t)( L'0' + on / 0100 );
  805                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
  806                             *buf++ = (wchar_t)( L'0' + on % 010 );
  807                         }
  808
  809                         opsz++;
  810                         len += 4;
  811                     }
  812                 }
  813                 else // MAP_INVALID_UTF8_NOT
  814                 {
  815                     return (size_t)-1;
  816                 }
  817             }
  818         }
  819     }
  820
  821     if (buf && (len < n))
  822         *buf = 0;
  823
  824     return len;
  825 }
 826
 827 static inline bool isoctal(wchar_t wch)
 828 {
 829     return L'0' <= wch && wch <= L'7';
 830 }
 831
 832 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 833 {
 834     size_t len = 0;
 835
 836     while (*psz && ((!buf) || (len < n)))
 837     {
 838         wxUint32 cc;
 839
 840 #ifdef WC_UTF16
 841         // cast is ok for WC_UTF16
 842         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 843         psz += (pa == (size_t)-1) ? 1 : pa;
 844 #else
 845         cc = (*psz++) & 0x7fffffff;
 846 #endif
 847
 848         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
 849                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
 850         {
 851             if (buf)
 852                 *buf++ = (char)(cc - wxUnicodePUA);
 853             len++;
 854         }
 855         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
 856                     && cc == L'\\' && psz[0] == L'\\' )
 857         {
 858             if (buf)
 859                 *buf++ = (char)cc;
 860             psz++;
 861             len++;
 862         }
 863         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
 864                     cc == L'\\' &&
 865                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
 866         {
 867             if (buf)
 868             {
 869                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
 870                                  (psz[1] - L'0') * 010 +
 871                                  (psz[2] - L'0'));
 872             }
 873
 874             psz += 3;
 875             len++;
 876         }
 877         else
 878         {
 879             unsigned cnt;
 880             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
 881             {
 882             }
 883
 884             if (!cnt)
 885             {
 886                 // plain ASCII char
 887                 if (buf)
 888                     *buf++ = (char) cc;
 889                 len++;
 890             }
 891
 892             else
 893             {
 894                 len += cnt + 1;
 895                 if (buf)
 896                 {
 897                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 898                     while (cnt--)
 899                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 900                 }
 901             }
 902         }
 903     }
 904
 905     if (buf && (len < n))
 906         *buf = 0;
 907
 908     return len;
 909 }
 910
 911 // ----------------------------------------------------------------------------
 912 // UTF-16
 913 // ----------------------------------------------------------------------------
 914
 915 #ifdef WORDS_BIGENDIAN
 916     #define wxMBConvUTF16straight wxMBConvUTF16BE
 917     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 918 #else
 919     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 920     #define wxMBConvUTF16straight wxMBConvUTF16LE
 921 #endif
 922
 923
 924 #ifdef WC_UTF16
 925
 926 // copy 16bit MB to 16bit String
 927 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 928 {
 929     size_t len = 0;
 930
 931     while (*(wxUint16*)psz && (!buf || len < n))
 932     {
 933         if (buf)
 934             *buf++ = *(wxUint16*)psz;
 935         len++;
 936
 937         psz += sizeof(wxUint16);
 938     }
 939
 940     if (buf && len < n)
 941        *buf = 0;
 942
 943     return len;
 944 }
 945
 946
 947 // copy 16bit String to 16bit MB
 948 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 949 {
 950     size_t len = 0;
 951
 952     while (*psz && (!buf || len < n))
 953     {
 954         if (buf)
 955         {
 956             *(wxUint16*)buf = *psz;
 957             buf += sizeof(wxUint16);
 958         }
 959
 960         len += sizeof(wxUint16);
 961         psz++;
 962     }
 963
 964     if (buf && len <= n - sizeof(wxUint16))
 965        *(wxUint16*)buf = 0;
 966
 967     return len;
 968 }
 969
 970
 971 // swap 16bit MB to 16bit String
 972 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 973 {
 974     size_t len = 0;
 975
 976     // UTF16 string must be terminated by 2 NULs as single NULs may occur
 977     // inside the string
 978     while ( (psz[0] || psz[1]) && (!buf || len < n) )
 979     {
 980         if ( buf )
 981         {
 982             ((char *)buf)[0] = psz[1];
 983             ((char *)buf)[1] = psz[0];
 984             buf++;
 985         }
 986         len++;
 987         psz += 2;
 988     }
 989
 990     if ( buf && len < n )
 991         *buf = L'\0';
 992
 993     return len;
 994 }
 995
 996
 997 // swap 16bit MB to 16bit String
 998 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 999 {
1000     size_t len = 0;
1001
1002     while ( *psz && (!buf || len < n) )
1003     {
1004         if ( buf )
1005         {
1006             *buf++ = ((char*)psz)[1];
1007             *buf++ = ((char*)psz)[0];
1008         }
1009
1010         len += 2;
1011         psz++;
1012     }
1013
1014     if ( buf && len < n - 1 )
1015     {
1016         buf[0] =
1017         buf[1] = '\0';
1018     }
1019
1020     return len;
1021 }
1022
1023
1024 #else // WC_UTF16
1025
1026
1027 // copy 16bit MB to 32bit String
1028 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1029 {
1030     size_t len = 0;
1031
1032     while (*(wxUint16*)psz && (!buf || len < n))
1033     {
1034         wxUint32 cc;
1035         size_t pa = decode_utf16((wxUint16*)psz, cc);
1036         if (pa == (size_t)-1)
1037             return pa;
1038
1039         if (buf)
1040             *buf++ = (wchar_t)cc;
1041         len++;
1042         psz += pa * sizeof(wxUint16);
1043     }
1044
1045     if (buf && len < n)
1046        *buf = 0;
1047
1048     return len;
1049 }
1050
1051
1052 // copy 32bit String to 16bit MB
1053 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1054 {
1055     size_t len=0;
1056
1057     while (*psz && (!buf || len < n))
1058     {
1059         wxUint16 cc[2];
1060         size_t pa = encode_utf16(*psz, cc);
1061
1062         if (pa == (size_t)-1)
1063             return pa;
1064
1065         if (buf)
1066         {
1067             *(wxUint16*)buf = cc[0];
1068             buf += sizeof(wxUint16);
1069             if (pa > 1)
1070             {
1071                 *(wxUint16*)buf = cc[1];
1072                 buf += sizeof(wxUint16);
1073             }
1074         }
1075
1076         len += pa*sizeof(wxUint16);
1077         psz++;
1078     }
1079
1080     if (buf && len <= n - sizeof(wxUint16))
1081        *(wxUint16*)buf = 0;
1082
1083     return len;
1084 }
1085
1086
1087 // swap 16bit MB to 32bit String
1088 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1089 {
1090     size_t len=0;
1091
1092     while (*(wxUint16*)psz && (!buf || len < n))
1093     {
1094         wxUint32 cc;
1095         char tmp[4];
1096
1097         tmp[0] = psz[1];
1098         tmp[1] = psz[0];
1099         tmp[2] = psz[3];
1100         tmp[3] = psz[2];
1101
1102         size_t pa = decode_utf16((wxUint16*)tmp, cc);
1103         if (pa == (size_t)-1)
1104             return pa;
1105
1106         if (buf)
1107             *buf++ = (wchar_t)cc;
1108
1109         len++;
1110         psz += pa * sizeof(wxUint16);
1111     }
1112
1113     if (buf && len < n)
1114        *buf = 0;
1115
1116     return len;
1117 }
1118
1119
1120 // swap 32bit String to 16bit MB
1121 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1122 {
1123     size_t len = 0;
1124
1125     while (*psz && (!buf || len < n))
1126     {
1127         wxUint16 cc[2];
1128         size_t pa = encode_utf16(*psz, cc);
1129
1130         if (pa == (size_t)-1)
1131             return pa;
1132
1133         if (buf)
1134         {
1135             *buf++ = ((char*)cc)[1];
1136             *buf++ = ((char*)cc)[0];
1137             if (pa > 1)
1138             {
1139                 *buf++ = ((char*)cc)[3];
1140                 *buf++ = ((char*)cc)[2];
1141             }
1142         }
1143
1144         len += pa * sizeof(wxUint16);
1145         psz++;
1146     }
1147
1148     if (buf && len <= n - sizeof(wxUint16))
1149        *(wxUint16*)buf = 0;
1150
1151     return len;
1152 }
1153
1154 #endif // WC_UTF16
1155
1156
1157 // ----------------------------------------------------------------------------
1158 // UTF-32
1159 // ----------------------------------------------------------------------------
1160
1161 #ifdef WORDS_BIGENDIAN
1162 #define wxMBConvUTF32straight  wxMBConvUTF32BE
1163 #define wxMBConvUTF32swap      wxMBConvUTF32LE
1164 #else
1165 #define wxMBConvUTF32swap      wxMBConvUTF32BE
1166 #define wxMBConvUTF32straight  wxMBConvUTF32LE
1167 #endif
1168
1169
1170 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1171 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1172
1173
1174 #ifdef WC_UTF16
1175
1176 // copy 32bit MB to 16bit String
1177 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1178 {
1179     size_t len = 0;
1180
1181     while (*(wxUint32*)psz && (!buf || len < n))
1182     {
1183         wxUint16 cc[2];
1184
1185         size_t pa = encode_utf16(*(wxUint32*)psz, cc);
1186         if (pa == (size_t)-1)
1187             return pa;
1188
1189         if (buf)
1190         {
1191             *buf++ = cc[0];
1192             if (pa > 1)
1193                 *buf++ = cc[1];
1194         }
1195
1196         len += pa;
1197         psz += sizeof(wxUint32);
1198     }
1199
1200     if (buf && len < n)
1201        *buf = 0;
1202
1203     return len;
1204 }
1205
1206
1207 // copy 16bit String to 32bit MB
1208 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1209 {
1210     size_t len = 0;
1211
1212     while (*psz && (!buf || len < n))
1213     {
1214         wxUint32 cc;
1215
1216         // cast is ok for WC_UTF16
1217         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1218         if (pa == (size_t)-1)
1219             return pa;
1220
1221         if (buf)
1222         {
1223             *(wxUint32*)buf = cc;
1224             buf += sizeof(wxUint32);
1225         }
1226
1227         len += sizeof(wxUint32);
1228         psz += pa;
1229     }
1230
1231     if (buf && len <= n - sizeof(wxUint32))
1232         *(wxUint32*)buf = 0;
1233
1234     return len;
1235 }
1236
1237
1238 // swap 32bit MB to 16bit String
1239 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1240 {
1241     size_t len = 0;
1242
1243     while (*(wxUint32*)psz && (!buf || len < n))
1244     {
1245         char tmp[4];
1246         tmp[0] = psz[3];
1247         tmp[1] = psz[2];
1248         tmp[2] = psz[1];
1249         tmp[3] = psz[0];
1250
1251         wxUint16 cc[2];
1252
1253         size_t pa = encode_utf16(*(wxUint32*)tmp, cc);
1254         if (pa == (size_t)-1)
1255             return pa;
1256
1257         if (buf)
1258         {
1259             *buf++ = cc[0];
1260             if (pa > 1)
1261                 *buf++ = cc[1];
1262         }
1263
1264         len += pa;
1265         psz += sizeof(wxUint32);
1266     }
1267
1268     if (buf && len < n)
1269         *buf = 0;
1270
1271     return len;
1272 }
1273
1274
1275 // swap 16bit String to 32bit MB
1276 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1277 {
1278     size_t len = 0;
1279
1280     while (*psz && (!buf || len < n))
1281     {
1282         char cc[4];
1283
1284         // cast is ok for WC_UTF16
1285         size_t pa = decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1286         if (pa == (size_t)-1)
1287             return pa;
1288
1289         if (buf)
1290         {
1291             *buf++ = cc[3];
1292             *buf++ = cc[2];
1293             *buf++ = cc[1];
1294             *buf++ = cc[0];
1295         }
1296
1297         len += sizeof(wxUint32);
1298         psz += pa;
1299     }
1300
1301     if (buf && len <= n - sizeof(wxUint32))
1302         *(wxUint32*)buf = 0;
1303
1304     return len;
1305 }
1306
1307 #else // WC_UTF16
1308
1309
1310 // copy 32bit MB to 32bit String
1311 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1312 {
1313     size_t len=0;
1314
1315     while (*(wxUint32*)psz && (!buf || len < n))
1316     {
1317         if (buf)
1318             *buf++ = (wchar_t)(*(wxUint32*)psz);
1319         len++;
1320         psz += sizeof(wxUint32);
1321     }
1322
1323     if (buf && len < n)
1324         *buf = 0;
1325
1326     return len;
1327 }
1328
1329
1330 // copy 32bit String to 32bit MB
1331 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1332 {
1333     size_t len = 0;
1334
1335     while (*psz && (!buf || len < n))
1336     {
1337         if (buf)
1338         {
1339             *(wxUint32*)buf = *psz;
1340             buf += sizeof(wxUint32);
1341         }
1342
1343         len += sizeof(wxUint32);
1344         psz++;
1345     }
1346
1347     if (buf && len <= n - sizeof(wxUint32))
1348         *(wxUint32*)buf = 0;
1349
1350     return len;
1351 }
1352
1353
1354 // swap 32bit MB to 32bit String
1355 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1356 {
1357     size_t len = 0;
1358
1359     while (*(wxUint32*)psz && (!buf || len < n))
1360     {
1361         if (buf)
1362         {
1363             ((char *)buf)[0] = psz[3];
1364             ((char *)buf)[1] = psz[2];
1365             ((char *)buf)[2] = psz[1];
1366             ((char *)buf)[3] = psz[0];
1367             buf++;
1368         }
1369
1370         len++;
1371         psz += sizeof(wxUint32);
1372     }
1373
1374     if (buf && len < n)
1375         *buf = 0;
1376
1377     return len;
1378 }
1379
1380
1381 // swap 32bit String to 32bit MB
1382 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1383 {
1384     size_t len = 0;
1385
1386     while (*psz && (!buf || len < n))
1387     {
1388         if (buf)
1389         {
1390             *buf++ = ((char *)psz)[3];
1391             *buf++ = ((char *)psz)[2];
1392             *buf++ = ((char *)psz)[1];
1393             *buf++ = ((char *)psz)[0];
1394         }
1395
1396         len += sizeof(wxUint32);
1397         psz++;
1398     }
1399
1400     if (buf && len <= n - sizeof(wxUint32))
1401         *(wxUint32*)buf = 0;
1402
1403     return len;
1404 }
1405
1406
1407 #endif // WC_UTF16
1408
1409
1410 // ============================================================================
1411 // The classes doing conversion using the iconv_xxx() functions
1412 // ============================================================================
1413
1414 #ifdef HAVE_ICONV
1415
1416 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1417 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1418 //     (unless there's yet another bug in glibc) the only case when iconv()
1419 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1420 //     left in the input buffer -- when _real_ error occurs,
1421 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1422 //     iconv() failure.
1423 //     [This bug does not appear in glibc 2.2.]
1424 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1425 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1426                                      (errno != E2BIG || bufLeft != 0))
1427 #else
1428 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1429 #endif
1430
1431 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1432
1433 #define ICONV_T_INVALID ((iconv_t)-1)
1434
1435 #if SIZEOF_WCHAR_T == 4
1436     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1437     #define WC_ENC      wxFONTENCODING_UTF32
1438 #elif SIZEOF_WCHAR_T == 2
1439     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1440     #define WC_ENC      wxFONTENCODING_UTF16
1441 #else // sizeof(wchar_t) != 2 nor 4
1442     // does this ever happen?
1443     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1444 #endif
1445
1446 // ----------------------------------------------------------------------------
1447 // wxMBConv_iconv: encapsulates an iconv character set
1448 // ----------------------------------------------------------------------------
1449
1450 class wxMBConv_iconv : public wxMBConv
1451 {
1452 public:
1453     wxMBConv_iconv(const wxChar *name);
1454     virtual ~wxMBConv_iconv();
1455
1456     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1457     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1458
1459     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1460     virtual size_t GetMBNulLen() const;
1461
1462     virtual wxMBConv *Clone() const
1463     {
1464         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1465         p->m_minMBCharWidth = m_minMBCharWidth;
1466         return p;
1467     }
1468
1469     bool IsOk() const
1470         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1471
1472 protected:
1473     // the iconv handlers used to translate from multibyte to wide char and in
1474     // the other direction
1475     iconv_t m2w,
1476             w2m;
1477
1478 #if wxUSE_THREADS
1479     // guards access to m2w and w2m objects
1480     wxMutex m_iconvMutex;
1481 #endif
1482
1483 private:
1484     // the name (for iconv_open()) of a wide char charset -- if none is
1485     // available on this machine, it will remain NULL
1486     static wxString ms_wcCharsetName;
1487
1488     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1489     // different endian-ness than the native one
1490     static bool ms_wcNeedsSwap;
1491
1492
1493     // name of the encoding handled by this conversion
1494     wxString m_name;
1495
1496     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1497     // initially
1498     size_t m_minMBCharWidth;
1499 };
1500
1501 // make the constructor available for unit testing
1502 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1503 {
1504     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1505     if ( !result->IsOk() )
1506     {
1507         delete result;
1508         return 0;
1509     }
1510
1511     return result;
1512 }
1513
1514 wxString wxMBConv_iconv::ms_wcCharsetName;
1515 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1516
1517 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1518               : m_name(name)
1519 {
1520     m_minMBCharWidth = 0;
1521
1522     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1523     // names for the charsets
1524     const wxCharBuffer cname(wxString(name).ToAscii());
1525
1526     // check for charset that represents wchar_t:
1527     if ( ms_wcCharsetName.empty() )
1528     {
1529         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1530
1531 #if wxUSE_FONTMAP
1532         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1533 #else // !wxUSE_FONTMAP
1534         static const wxChar *names[] =
1535         {
1536 #if SIZEOF_WCHAR_T == 4
1537             _T("UCS-4"),
1538 #elif SIZEOF_WCHAR_T = 2
1539             _T("UCS-2"),
1540 #endif
1541             NULL
1542         };
1543 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1544
1545         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1546         {
1547             const wxString nameCS(*names);
1548
1549             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1550             wxString nameXE(nameCS);
1551             #ifdef WORDS_BIGENDIAN
1552                 nameXE += _T("BE");
1553             #else // little endian
1554                 nameXE += _T("LE");
1555             #endif
1556
1557             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1558                        nameXE.c_str());
1559
1560             m2w = iconv_open(nameXE.ToAscii(), cname);
1561             if ( m2w == ICONV_T_INVALID )
1562             {
1563                 // try charset w/o bytesex info (e.g. "UCS4")
1564                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1565                            nameCS.c_str());
1566                 m2w = iconv_open(nameCS.ToAscii(), cname);
1567
1568                 // and check for bytesex ourselves:
1569                 if ( m2w != ICONV_T_INVALID )
1570                 {
1571                     char    buf[2], *bufPtr;
1572                     wchar_t wbuf[2], *wbufPtr;
1573                     size_t  insz, outsz;
1574                     size_t  res;
1575
1576                     buf[0] = 'A';
1577                     buf[1] = 0;
1578                     wbuf[0] = 0;
1579                     insz = 2;
1580                     outsz = SIZEOF_WCHAR_T * 2;
1581                     wbufPtr = wbuf;
1582                     bufPtr = buf;
1583
1584                     res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1585                                 (char**)&wbufPtr, &outsz);
1586
1587                     if (ICONV_FAILED(res, insz))
1588                     {
1589                         wxLogLastError(wxT("iconv"));
1590                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1591                                    nameCS.c_str());
1592                     }
1593                     else // ok, can convert to this encoding, remember it
1594                     {
1595                         ms_wcCharsetName = nameCS;
1596                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1597                     }
1598                 }
1599             }
1600             else // use charset not requiring byte swapping
1601             {
1602                 ms_wcCharsetName = nameXE;
1603             }
1604         }
1605
1606         wxLogTrace(TRACE_STRCONV,
1607                    wxT("iconv wchar_t charset is \"%s\"%s"),
1608                    ms_wcCharsetName.empty() ? _T("<none>")
1609                                             : ms_wcCharsetName.c_str(),
1610                    ms_wcNeedsSwap ? _T(" (needs swap)")
1611                                   : _T(""));
1612     }
1613     else // we already have ms_wcCharsetName
1614     {
1615         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1616     }
1617
1618     if ( ms_wcCharsetName.empty() )
1619     {
1620         w2m = ICONV_T_INVALID;
1621     }
1622     else
1623     {
1624         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1625         if ( w2m == ICONV_T_INVALID )
1626         {
1627             wxLogTrace(TRACE_STRCONV,
1628                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1629                        ms_wcCharsetName.c_str(), cname.data());
1630         }
1631     }
1632 }
1633
1634 wxMBConv_iconv::~wxMBConv_iconv()
1635 {
1636     if ( m2w != ICONV_T_INVALID )
1637         iconv_close(m2w);
1638     if ( w2m != ICONV_T_INVALID )
1639         iconv_close(w2m);
1640 }
1641
1642 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1643 {
1644     // find the string length: notice that must be done differently for
1645     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1646     size_t inbuf;
1647     const size_t nulLen = GetMBNulLen();
1648     switch ( nulLen )
1649     {
1650         default:
1651             return (size_t)-1;
1652
1653         case 1:
1654             inbuf = strlen(psz); // arguably more optimized than our version
1655             break;
1656
1657         case 2:
1658         case 4:
1659             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1660             // they also have to start at character boundary and not span two
1661             // adjacent characters
1662             const char *p;
1663             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1664                 ;
1665             inbuf = p - psz;
1666             break;
1667     }
1668
1669 #if wxUSE_THREADS
1670     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1671     //     Unfortunately there is a couple of global wxCSConv objects such as
1672     //     wxConvLocal that are used all over wx code, so we have to make sure
1673     //     the handle is used by at most one thread at the time. Otherwise
1674     //     only a few wx classes would be safe to use from non-main threads
1675     //     as MB<->WC conversion would fail "randomly".
1676     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1677 #endif // wxUSE_THREADS
1678
1679     size_t outbuf = n * SIZEOF_WCHAR_T;
1680     size_t res, cres;
1681     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1682     wchar_t *bufPtr = buf;
1683     const char *pszPtr = psz;
1684
1685     if (buf)
1686     {
1687         // have destination buffer, convert there
1688         cres = iconv(m2w,
1689                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1690                      (char**)&bufPtr, &outbuf);
1691         res = n - (outbuf / SIZEOF_WCHAR_T);
1692
1693         if (ms_wcNeedsSwap)
1694         {
1695             // convert to native endianness
1696             for ( unsigned i = 0; i < res; i++ )
1697                 buf[n] = WC_BSWAP(buf[i]);
1698         }
1699
1700         // NUL-terminate the string if there is any space left
1701         if (res < n)
1702             buf[res] = 0;
1703     }
1704     else
1705     {
1706         // no destination buffer... convert using temp buffer
1707         // to calculate destination buffer requirement
1708         wchar_t tbuf[8];
1709         res = 0;
1710
1711         do
1712         {
1713             bufPtr = tbuf;
1714             outbuf = 8 * SIZEOF_WCHAR_T;
1715
1716             cres = iconv(m2w,
1717                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1718                          (char**)&bufPtr, &outbuf );
1719
1720             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1721         }
1722         while ((cres == (size_t)-1) && (errno == E2BIG));
1723     }
1724
1725     if (ICONV_FAILED(cres, inbuf))
1726     {
1727         //VS: it is ok if iconv fails, hence trace only
1728         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1729         return (size_t)-1;
1730     }
1731
1732     return res;
1733 }
1734
1735 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1736 {
1737 #if wxUSE_THREADS
1738     // NB: explained in MB2WC
1739     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1740 #endif
1741
1742     size_t inlen = wxWcslen(psz);
1743     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1744     size_t outbuf = n;
1745     size_t res, cres;
1746
1747     wchar_t *tmpbuf = 0;
1748
1749     if (ms_wcNeedsSwap)
1750     {
1751         // need to copy to temp buffer to switch endianness
1752         // (doing WC_BSWAP twice on the original buffer won't help, as it
1753         //  could be in read-only memory, or be accessed in some other thread)
1754         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1755         for ( size_t i = 0; i < inlen; i++ )
1756             tmpbuf[n] = WC_BSWAP(psz[i]);
1757
1758         tmpbuf[inlen] = L'\0';
1759         psz = tmpbuf;
1760     }
1761
1762     if (buf)
1763     {
1764         // have destination buffer, convert there
1765         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1766
1767         res = n - outbuf;
1768
1769         // NB: iconv was given only wcslen(psz) characters on input, and so
1770         //     it couldn't convert the trailing zero. Let's do it ourselves
1771         //     if there's some room left for it in the output buffer.
1772         if (res < n)
1773             buf[0] = 0;
1774     }
1775     else
1776     {
1777         // no destination buffer... convert using temp buffer
1778         // to calculate destination buffer requirement
1779         char tbuf[16];
1780         res = 0;
1781         do
1782         {
1783             buf = tbuf;
1784             outbuf = 16;
1785
1786             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1787
1788             res += 16 - outbuf;
1789         }
1790         while ((cres == (size_t)-1) && (errno == E2BIG));
1791     }
1792
1793     if (ms_wcNeedsSwap)
1794     {
1795         free(tmpbuf);
1796     }
1797
1798     if (ICONV_FAILED(cres, inbuf))
1799     {
1800         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1801         return (size_t)-1;
1802     }
1803
1804     return res;
1805 }
1806
1807 size_t wxMBConv_iconv::GetMBNulLen() const
1808 {
1809     if ( m_minMBCharWidth == 0 )
1810     {
1811         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1812
1813 #if wxUSE_THREADS
1814         // NB: explained in MB2WC
1815         wxMutexLocker lock(self->m_iconvMutex);
1816 #endif
1817
1818         wchar_t *wnul = L"";
1819         char buf[8]; // should be enough for NUL in any encoding
1820         size_t inLen = sizeof(wchar_t),
1821                outLen = WXSIZEOF(buf);
1822         char *inBuff = (char *)wnul;
1823         char *outBuff = buf;
1824         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1825         {
1826             self->m_minMBCharWidth = (size_t)-1;
1827         }
1828         else // ok
1829         {
1830             self->m_minMBCharWidth = outBuff - buf;
1831         }
1832     }
1833
1834     return m_minMBCharWidth;
1835 }
1836
1837 #endif // HAVE_ICONV
1838
1839
1840 // ============================================================================
1841 // Win32 conversion classes
1842 // ============================================================================
1843
1844 #ifdef wxHAVE_WIN32_MB2WC
1845
1846 // from utils.cpp
1847 #if wxUSE_FONTMAP
1848 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1849 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1850 #endif
1851
1852 class wxMBConv_win32 : public wxMBConv
1853 {
1854 public:
1855     wxMBConv_win32()
1856     {
1857         m_CodePage = CP_ACP;
1858         m_minMBCharWidth = 0;
1859     }
1860
1861     wxMBConv_win32(const wxMBConv_win32& conv)
1862     {
1863         m_CodePage = conv.m_CodePage;
1864         m_minMBCharWidth = conv.m_minMBCharWidth;
1865     }
1866
1867 #if wxUSE_FONTMAP
1868     wxMBConv_win32(const wxChar* name)
1869     {
1870         m_CodePage = wxCharsetToCodepage(name);
1871         m_minMBCharWidth = 0;
1872     }
1873
1874     wxMBConv_win32(wxFontEncoding encoding)
1875     {
1876         m_CodePage = wxEncodingToCodepage(encoding);
1877         m_minMBCharWidth = 0;
1878     }
1879 #endif // wxUSE_FONTMAP
1880
1881     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1882     {
1883         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1884         // the behaviour is not compatible with the Unix version (using iconv)
1885         // and break the library itself, e.g. wxTextInputStream::NextChar()
1886         // wouldn't work if reading an incomplete MB char didn't result in an
1887         // error
1888         //
1889         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
1890         // Win XP or newer and it is not supported for UTF-[78] so we always
1891         // use our own conversions in this case. See
1892         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
1893         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
1894         if ( m_CodePage == CP_UTF8 )
1895         {
1896             return wxConvUTF8.MB2WC(buf, psz, n);
1897         }
1898
1899         if ( m_CodePage == CP_UTF7 )
1900         {
1901             return wxConvUTF7.MB2WC(buf, psz, n);
1902         }
1903
1904         int flags = 0;
1905         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
1906                 IsAtLeastWin2kSP4() )
1907         {
1908             flags = MB_ERR_INVALID_CHARS;
1909         }
1910
1911         const size_t len = ::MultiByteToWideChar
1912                              (
1913                                 m_CodePage,     // code page
1914                                 flags,          // flags: fall on error
1915                                 psz,            // input string
1916                                 -1,             // its length (NUL-terminated)
1917                                 buf,            // output string
1918                                 buf ? n : 0     // size of output buffer
1919                              );
1920         if ( !len )
1921         {
1922             // function totally failed
1923             return (size_t)-1;
1924         }
1925
1926         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
1927         // check if we succeeded, by doing a double trip:
1928         if ( !flags && buf )
1929         {
1930             const size_t mbLen = strlen(psz);
1931             wxCharBuffer mbBuf(mbLen);
1932             if ( ::WideCharToMultiByte
1933                    (
1934                       m_CodePage,
1935                       0,
1936                       buf,
1937                       -1,
1938                       mbBuf.data(),
1939                       mbLen + 1,        // size in bytes, not length
1940                       NULL,
1941                       NULL
1942                    ) == 0 ||
1943                   strcmp(mbBuf, psz) != 0 )
1944             {
1945                 // we didn't obtain the same thing we started from, hence
1946                 // the conversion was lossy and we consider that it failed
1947                 return (size_t)-1;
1948             }
1949         }
1950
1951         // note that it returns count of written chars for buf != NULL and size
1952         // of the needed buffer for buf == NULL so in either case the length of
1953         // the string (which never includes the terminating NUL) is one less
1954         return len - 1;
1955     }
1956
1957     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1958     {
1959         /*
1960             we have a problem here: by default, WideCharToMultiByte() may
1961             replace characters unrepresentable in the target code page with bad
1962             quality approximations such as turning "1/2" symbol (U+00BD) into
1963             "1" for the code pages which don't have it and we, obviously, want
1964             to avoid this at any price
1965
1966             the trouble is that this function does it _silently_, i.e. it won't
1967             even tell us whether it did or not... Win98/2000 and higher provide
1968             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1969             we have to resort to a round trip, i.e. check that converting back
1970             results in the same string -- this is, of course, expensive but
1971             otherwise we simply can't be sure to not garble the data.
1972          */
1973
1974         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1975         // it doesn't work with CJK encodings (which we test for rather roughly
1976         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1977         // supporting it
1978         BOOL usedDef wxDUMMY_INITIALIZE(false);
1979         BOOL *pUsedDef;
1980         int flags;
1981         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1982         {
1983             // it's our lucky day
1984             flags = WC_NO_BEST_FIT_CHARS;
1985             pUsedDef = &usedDef;
1986         }
1987         else // old system or unsupported encoding
1988         {
1989             flags = 0;
1990             pUsedDef = NULL;
1991         }
1992
1993         const size_t len = ::WideCharToMultiByte
1994                              (
1995                                 m_CodePage,     // code page
1996                                 flags,          // either none or no best fit
1997                                 pwz,            // input string
1998                                 -1,             // it is (wide) NUL-terminated
1999                                 buf,            // output buffer
2000                                 buf ? n : 0,    // and its size
2001                                 NULL,           // default "replacement" char
2002                                 pUsedDef        // [out] was it used?
2003                              );
2004
2005         if ( !len )
2006         {
2007             // function totally failed
2008             return (size_t)-1;
2009         }
2010
2011         // if we were really converting, check if we succeeded
2012         if ( buf )
2013         {
2014             if ( flags )
2015             {
2016                 // check if the conversion failed, i.e. if any replacements
2017                 // were done
2018                 if ( usedDef )
2019                     return (size_t)-1;
2020             }
2021             else // we must resort to double tripping...
2022             {
2023                 wxWCharBuffer wcBuf(n);
2024                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2025                         wcscmp(wcBuf, pwz) != 0 )
2026                 {
2027                     // we didn't obtain the same thing we started from, hence
2028                     // the conversion was lossy and we consider that it failed
2029                     return (size_t)-1;
2030                 }
2031             }
2032         }
2033
2034         // see the comment above for the reason of "len - 1"
2035         return len - 1;
2036     }
2037
2038     virtual size_t GetMBNulLen() const
2039     {
2040         if ( m_minMBCharWidth == 0 )
2041         {
2042             int len = ::WideCharToMultiByte
2043                         (
2044                             m_CodePage,     // code page
2045                             0,              // no flags
2046                             L"",            // input string
2047                             1,              // translate just the NUL
2048                             NULL,           // output buffer
2049                             0,              // and its size
2050                             NULL,           // no replacement char
2051                             NULL            // [out] don't care if it was used
2052                         );
2053
2054             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2055             switch ( len )
2056             {
2057                 default:
2058                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2059                     self->m_minMBCharWidth = (size_t)-1;
2060                     break;
2061
2062                 case 0:
2063                     self->m_minMBCharWidth = (size_t)-1;
2064                     break;
2065
2066                 case 1:
2067                 case 2:
2068                 case 4:
2069                     self->m_minMBCharWidth = len;
2070                     break;
2071             }
2072         }
2073
2074         return m_minMBCharWidth;
2075     }
2076
2077     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2078
2079     bool IsOk() const { return m_CodePage != -1; }
2080
2081 private:
2082     static bool CanUseNoBestFit()
2083     {
2084         static int s_isWin98Or2k = -1;
2085
2086         if ( s_isWin98Or2k == -1 )
2087         {
2088             int verMaj, verMin;
2089             switch ( wxGetOsVersion(&verMaj, &verMin) )
2090             {
2091                 case wxWIN95:
2092                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2093                     break;
2094
2095                 case wxWINDOWS_NT:
2096                     s_isWin98Or2k = verMaj >= 5;
2097                     break;
2098
2099                 default:
2100                     // unknown, be conservative by default
2101                     s_isWin98Or2k = 0;
2102                     break;
2103             }
2104
2105             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2106         }
2107
2108         return s_isWin98Or2k == 1;
2109     }
2110
2111     static bool IsAtLeastWin2kSP4()
2112     {
2113 #ifdef __WXWINCE__
2114         return false;
2115 #else
2116         static int s_isAtLeastWin2kSP4 = -1;
2117
2118         if ( s_isAtLeastWin2kSP4 == -1 )
2119         {
2120             OSVERSIONINFOEX ver;
2121
2122             memset(&ver, 0, sizeof(ver));
2123             ver.dwOSVersionInfoSize = sizeof(ver);
2124             GetVersionEx((OSVERSIONINFO*)&ver);
2125
2126             s_isAtLeastWin2kSP4 =
2127               ((ver.dwMajorVersion > 5) || // Vista+
2128                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2129                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2130                ver.wServicePackMajor >= 4)) // 2000 SP4+
2131               ? 1 : 0;
2132         }
2133
2134         return s_isAtLeastWin2kSP4 == 1;
2135 #endif
2136     }
2137
2138
2139     // the code page we're working with
2140     long m_CodePage;
2141
2142     // cached result of GetMBNulLen(), set to 0 initially meaning
2143     // "unknown"
2144     size_t m_minMBCharWidth;
2145 };
2146
2147 #endif // wxHAVE_WIN32_MB2WC
2148
2149 // ============================================================================
2150 // Cocoa conversion classes
2151 // ============================================================================
2152
2153 #if defined(__WXCOCOA__)
2154
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a
// bit - it's just not public (yet).
2158
2159 #include <CoreFoundation/CFString.h>
2160 #include <CoreFoundation/CFStringEncodingExt.h>
2161
2162 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2163 {
2164     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2165
2166     switch (encoding)
2167     {
2168         case wxFONTENCODING_DEFAULT :
2169             enc = CFStringGetSystemEncoding();
2170             break ;
2171
2172         case wxFONTENCODING_ISO8859_1 :
2173             enc = kCFStringEncodingISOLatin1 ;
2174             break ;
2175         case wxFONTENCODING_ISO8859_2 :
2176             enc = kCFStringEncodingISOLatin2;
2177             break ;
2178         case wxFONTENCODING_ISO8859_3 :
2179             enc = kCFStringEncodingISOLatin3 ;
2180             break ;
2181         case wxFONTENCODING_ISO8859_4 :
2182             enc = kCFStringEncodingISOLatin4;
2183             break ;
2184         case wxFONTENCODING_ISO8859_5 :
2185             enc = kCFStringEncodingISOLatinCyrillic;
2186             break ;
2187         case wxFONTENCODING_ISO8859_6 :
2188             enc = kCFStringEncodingISOLatinArabic;
2189             break ;
2190         case wxFONTENCODING_ISO8859_7 :
2191             enc = kCFStringEncodingISOLatinGreek;
2192             break ;
2193         case wxFONTENCODING_ISO8859_8 :
2194             enc = kCFStringEncodingISOLatinHebrew;
2195             break ;
2196         case wxFONTENCODING_ISO8859_9 :
2197             enc = kCFStringEncodingISOLatin5;
2198             break ;
2199         case wxFONTENCODING_ISO8859_10 :
2200             enc = kCFStringEncodingISOLatin6;
2201             break ;
2202         case wxFONTENCODING_ISO8859_11 :
2203             enc = kCFStringEncodingISOLatinThai;
2204             break ;
2205         case wxFONTENCODING_ISO8859_13 :
2206             enc = kCFStringEncodingISOLatin7;
2207             break ;
2208         case wxFONTENCODING_ISO8859_14 :
2209             enc = kCFStringEncodingISOLatin8;
2210             break ;
2211         case wxFONTENCODING_ISO8859_15 :
2212             enc = kCFStringEncodingISOLatin9;
2213             break ;
2214
2215         case wxFONTENCODING_KOI8 :
2216             enc = kCFStringEncodingKOI8_R;
2217             break ;
2218         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2219             enc = kCFStringEncodingDOSRussian;
2220             break ;
2221
2222 //      case wxFONTENCODING_BULGARIAN :
2223 //          enc = ;
2224 //          break ;
2225
2226         case wxFONTENCODING_CP437 :
2227             enc = kCFStringEncodingDOSLatinUS ;
2228             break ;
2229         case wxFONTENCODING_CP850 :
2230             enc = kCFStringEncodingDOSLatin1;
2231             break ;
2232         case wxFONTENCODING_CP852 :
2233             enc = kCFStringEncodingDOSLatin2;
2234             break ;
2235         case wxFONTENCODING_CP855 :
2236             enc = kCFStringEncodingDOSCyrillic;
2237             break ;
2238         case wxFONTENCODING_CP866 :
2239             enc = kCFStringEncodingDOSRussian ;
2240             break ;
2241         case wxFONTENCODING_CP874 :
2242             enc = kCFStringEncodingDOSThai;
2243             break ;
2244         case wxFONTENCODING_CP932 :
2245             enc = kCFStringEncodingDOSJapanese;
2246             break ;
2247         case wxFONTENCODING_CP936 :
2248             enc = kCFStringEncodingDOSChineseSimplif ;
2249             break ;
2250         case wxFONTENCODING_CP949 :
2251             enc = kCFStringEncodingDOSKorean;
2252             break ;
2253         case wxFONTENCODING_CP950 :
2254             enc = kCFStringEncodingDOSChineseTrad;
2255             break ;
2256         case wxFONTENCODING_CP1250 :
2257             enc = kCFStringEncodingWindowsLatin2;
2258             break ;
2259         case wxFONTENCODING_CP1251 :
2260             enc = kCFStringEncodingWindowsCyrillic ;
2261             break ;
2262         case wxFONTENCODING_CP1252 :
2263             enc = kCFStringEncodingWindowsLatin1 ;
2264             break ;
2265         case wxFONTENCODING_CP1253 :
2266             enc = kCFStringEncodingWindowsGreek;
2267             break ;
2268         case wxFONTENCODING_CP1254 :
2269             enc = kCFStringEncodingWindowsLatin5;
2270             break ;
2271         case wxFONTENCODING_CP1255 :
2272             enc = kCFStringEncodingWindowsHebrew ;
2273             break ;
2274         case wxFONTENCODING_CP1256 :
2275             enc = kCFStringEncodingWindowsArabic ;
2276             break ;
2277         case wxFONTENCODING_CP1257 :
2278             enc = kCFStringEncodingWindowsBalticRim;
2279             break ;
2280 //   This only really encodes to UTF7 (if that) evidently
2281 //        case wxFONTENCODING_UTF7 :
2282 //            enc = kCFStringEncodingNonLossyASCII ;
2283 //            break ;
2284         case wxFONTENCODING_UTF8 :
2285             enc = kCFStringEncodingUTF8 ;
2286             break ;
2287         case wxFONTENCODING_EUC_JP :
2288             enc = kCFStringEncodingEUC_JP;
2289             break ;
2290         case wxFONTENCODING_UTF16 :
2291             enc = kCFStringEncodingUnicode ;
2292             break ;
2293         case wxFONTENCODING_MACROMAN :
2294             enc = kCFStringEncodingMacRoman ;
2295             break ;
2296         case wxFONTENCODING_MACJAPANESE :
2297             enc = kCFStringEncodingMacJapanese ;
2298             break ;
2299         case wxFONTENCODING_MACCHINESETRAD :
2300             enc = kCFStringEncodingMacChineseTrad ;
2301             break ;
2302         case wxFONTENCODING_MACKOREAN :
2303             enc = kCFStringEncodingMacKorean ;
2304             break ;
2305         case wxFONTENCODING_MACARABIC :
2306             enc = kCFStringEncodingMacArabic ;
2307             break ;
2308         case wxFONTENCODING_MACHEBREW :
2309             enc = kCFStringEncodingMacHebrew ;
2310             break ;
2311         case wxFONTENCODING_MACGREEK :
2312             enc = kCFStringEncodingMacGreek ;
2313             break ;
2314         case wxFONTENCODING_MACCYRILLIC :
2315             enc = kCFStringEncodingMacCyrillic ;
2316             break ;
2317         case wxFONTENCODING_MACDEVANAGARI :
2318             enc = kCFStringEncodingMacDevanagari ;
2319             break ;
2320         case wxFONTENCODING_MACGURMUKHI :
2321             enc = kCFStringEncodingMacGurmukhi ;
2322             break ;
2323         case wxFONTENCODING_MACGUJARATI :
2324             enc = kCFStringEncodingMacGujarati ;
2325             break ;
2326         case wxFONTENCODING_MACORIYA :
2327             enc = kCFStringEncodingMacOriya ;
2328             break ;
2329         case wxFONTENCODING_MACBENGALI :
2330             enc = kCFStringEncodingMacBengali ;
2331             break ;
2332         case wxFONTENCODING_MACTAMIL :
2333             enc = kCFStringEncodingMacTamil ;
2334             break ;
2335         case wxFONTENCODING_MACTELUGU :
2336             enc = kCFStringEncodingMacTelugu ;
2337             break ;
2338         case wxFONTENCODING_MACKANNADA :
2339             enc = kCFStringEncodingMacKannada ;
2340             break ;
2341         case wxFONTENCODING_MACMALAJALAM :
2342             enc = kCFStringEncodingMacMalayalam ;
2343             break ;
2344         case wxFONTENCODING_MACSINHALESE :
2345             enc = kCFStringEncodingMacSinhalese ;
2346             break ;
2347         case wxFONTENCODING_MACBURMESE :
2348             enc = kCFStringEncodingMacBurmese ;
2349             break ;
2350         case wxFONTENCODING_MACKHMER :
2351             enc = kCFStringEncodingMacKhmer ;
2352             break ;
2353         case wxFONTENCODING_MACTHAI :
2354             enc = kCFStringEncodingMacThai ;
2355             break ;
2356         case wxFONTENCODING_MACLAOTIAN :
2357             enc = kCFStringEncodingMacLaotian ;
2358             break ;
2359         case wxFONTENCODING_MACGEORGIAN :
2360             enc = kCFStringEncodingMacGeorgian ;
2361             break ;
2362         case wxFONTENCODING_MACARMENIAN :
2363             enc = kCFStringEncodingMacArmenian ;
2364             break ;
2365         case wxFONTENCODING_MACCHINESESIMP :
2366             enc = kCFStringEncodingMacChineseSimp ;
2367             break ;
2368         case wxFONTENCODING_MACTIBETAN :
2369             enc = kCFStringEncodingMacTibetan ;
2370             break ;
2371         case wxFONTENCODING_MACMONGOLIAN :
2372             enc = kCFStringEncodingMacMongolian ;
2373             break ;
2374         case wxFONTENCODING_MACETHIOPIC :
2375             enc = kCFStringEncodingMacEthiopic ;
2376             break ;
2377         case wxFONTENCODING_MACCENTRALEUR :
2378             enc = kCFStringEncodingMacCentralEurRoman ;
2379             break ;
2380         case wxFONTENCODING_MACVIATNAMESE :
2381             enc = kCFStringEncodingMacVietnamese ;
2382             break ;
2383         case wxFONTENCODING_MACARABICEXT :
2384             enc = kCFStringEncodingMacExtArabic ;
2385             break ;
2386         case wxFONTENCODING_MACSYMBOL :
2387             enc = kCFStringEncodingMacSymbol ;
2388             break ;
2389         case wxFONTENCODING_MACDINGBATS :
2390             enc = kCFStringEncodingMacDingbats ;
2391             break ;
2392         case wxFONTENCODING_MACTURKISH :
2393             enc = kCFStringEncodingMacTurkish ;
2394             break ;
2395         case wxFONTENCODING_MACCROATIAN :
2396             enc = kCFStringEncodingMacCroatian ;
2397             break ;
2398         case wxFONTENCODING_MACICELANDIC :
2399             enc = kCFStringEncodingMacIcelandic ;
2400             break ;
2401         case wxFONTENCODING_MACROMANIAN :
2402             enc = kCFStringEncodingMacRomanian ;
2403             break ;
2404         case wxFONTENCODING_MACCELTIC :
2405             enc = kCFStringEncodingMacCeltic ;
2406             break ;
2407         case wxFONTENCODING_MACGAELIC :
2408             enc = kCFStringEncodingMacGaelic ;
2409             break ;
2410 //      case wxFONTENCODING_MACKEYBOARD :
2411 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2412 //          break ;
2413
2414         default :
2415             // because gcc is picky
2416             break ;
2417     }
2418
2419     return enc ;
2420 }
2421
2422 class wxMBConv_cocoa : public wxMBConv
2423 {
2424 public:
2425     wxMBConv_cocoa()
2426     {
2427         Init(CFStringGetSystemEncoding()) ;
2428     }
2429
2430     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2431     {
2432         m_encoding = conv.m_encoding;
2433     }
2434
2435 #if wxUSE_FONTMAP
2436     wxMBConv_cocoa(const wxChar* name)
2437     {
2438         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2439     }
2440 #endif
2441
2442     wxMBConv_cocoa(wxFontEncoding encoding)
2443     {
2444         Init( wxCFStringEncFromFontEnc(encoding) );
2445     }
2446
2447     ~wxMBConv_cocoa()
2448     {
2449     }
2450
2451     void Init( CFStringEncoding encoding)
2452     {
2453         m_encoding = encoding ;
2454     }
2455
2456     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2457     {
2458         wxASSERT(szUnConv);
2459
2460         CFStringRef theString = CFStringCreateWithBytes (
2461                                                 NULL, //the allocator
2462                                                 (const UInt8*)szUnConv,
2463                                                 strlen(szUnConv),
2464                                                 m_encoding,
2465                                                 false //no BOM/external representation
2466                                                 );
2467
2468         wxASSERT(theString);
2469
2470         size_t nOutLength = CFStringGetLength(theString);
2471
2472         if (szOut == NULL)
2473         {
2474             CFRelease(theString);
2475             return nOutLength;
2476         }
2477
2478         CFRange theRange = { 0, nOutSize };
2479
2480 #if SIZEOF_WCHAR_T == 4
2481         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2482 #endif
2483
2484         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2485
2486         CFRelease(theString);
2487
2488         szUniCharBuffer[nOutLength] = '\0' ;
2489
2490 #if SIZEOF_WCHAR_T == 4
2491         wxMBConvUTF16 converter ;
2492         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
2493         delete[] szUniCharBuffer;
2494 #endif
2495
2496         return nOutLength;
2497     }
2498
2499     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2500     {
2501         wxASSERT(szUnConv);
2502
2503         size_t nRealOutSize;
2504         size_t nBufSize = wxWcslen(szUnConv);
2505         UniChar* szUniBuffer = (UniChar*) szUnConv;
2506
2507 #if SIZEOF_WCHAR_T == 4
2508         wxMBConvUTF16 converter ;
2509         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2510         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2511         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2512         nBufSize /= sizeof(UniChar);
2513 #endif
2514
2515         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2516                                 NULL, //allocator
2517                                 szUniBuffer,
2518                                 nBufSize,
2519                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2520                             );
2521
2522         wxASSERT(theString);
2523
2524         //Note that CER puts a BOM when converting to unicode
2525         //so we  check and use getchars instead in that case
2526         if (m_encoding == kCFStringEncodingUnicode)
2527         {
2528             if (szOut != NULL)
2529                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2530
2531             nRealOutSize = CFStringGetLength(theString) + 1;
2532         }
2533         else
2534         {
2535             CFStringGetBytes(
2536                 theString,
2537                 CFRangeMake(0, CFStringGetLength(theString)),
2538                 m_encoding,
2539                 0, //what to put in characters that can't be converted -
2540                     //0 tells CFString to return NULL if it meets such a character
2541                 false, //not an external representation
2542                 (UInt8*) szOut,
2543                 nOutSize,
2544                 (CFIndex*) &nRealOutSize
2545                         );
2546         }
2547
2548         CFRelease(theString);
2549
2550 #if SIZEOF_WCHAR_T == 4
2551         delete[] szUniBuffer;
2552 #endif
2553
2554         return  nRealOutSize - 1;
2555     }
2556
2557     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2558
2559     bool IsOk() const
2560     {
2561         return m_encoding != kCFStringEncodingInvalidId &&
2562               CFStringIsEncodingAvailable(m_encoding);
2563     }
2564
2565 private:
2566     CFStringEncoding m_encoding ;
2567 };
2568
2569 #endif // defined(__WXCOCOA__)
2570
2571 // ============================================================================
2572 // Mac conversion classes
2573 // ============================================================================
2574
2575 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2576
2577 class wxMBConv_mac : public wxMBConv
2578 {
2579 public:
2580     wxMBConv_mac()
2581     {
2582         Init(CFStringGetSystemEncoding()) ;
2583     }
2584
2585     wxMBConv_mac(const wxMBConv_mac& conv)
2586     {
2587         Init(conv.m_char_encoding);
2588     }
2589
2590 #if wxUSE_FONTMAP
2591     wxMBConv_mac(const wxChar* name)
2592     {
2593         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2594     }
2595 #endif
2596
2597     wxMBConv_mac(wxFontEncoding encoding)
2598     {
2599         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2600     }
2601
2602     ~wxMBConv_mac()
2603     {
2604         OSStatus status = noErr ;
2605         status = TECDisposeConverter(m_MB2WC_converter);
2606         status = TECDisposeConverter(m_WC2MB_converter);
2607     }
2608
2609
2610     void Init( TextEncodingBase encoding)
2611     {
2612         OSStatus status = noErr ;
2613         m_char_encoding = encoding ;
2614         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2615
2616         status = TECCreateConverter(&m_MB2WC_converter,
2617                                     m_char_encoding,
2618                                     m_unicode_encoding);
2619         status = TECCreateConverter(&m_WC2MB_converter,
2620                                     m_unicode_encoding,
2621                                     m_char_encoding);
2622     }
2623
2624     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2625     {
2626         OSStatus status = noErr ;
2627         ByteCount byteOutLen ;
2628         ByteCount byteInLen = strlen(psz) ;
2629         wchar_t *tbuf = NULL ;
2630         UniChar* ubuf = NULL ;
2631         size_t res = 0 ;
2632
2633         if (buf == NULL)
2634         {
2635             //apple specs say at least 32
2636             n = wxMax( 32 , byteInLen ) ;
2637             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2638         }
2639
2640         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2641
2642 #if SIZEOF_WCHAR_T == 4
2643         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2644 #else
2645         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2646 #endif
2647         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2648           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2649 #if SIZEOF_WCHAR_T == 4
2650         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2651         // is not properly terminated we get random characters at the end
2652         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2653         wxMBConvUTF16 converter ;
2654         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2655         free( ubuf ) ;
2656 #else
2657         res = byteOutLen / sizeof( UniChar ) ;
2658 #endif
2659
2660         if ( buf == NULL )
2661              free(tbuf) ;
2662
2663         if ( buf  && res < n)
2664             buf[res] = 0;
2665
2666         return res ;
2667     }
2668
2669     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2670     {
2671         OSStatus status = noErr ;
2672         ByteCount byteOutLen ;
2673         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2674
2675         char *tbuf = NULL ;
2676
2677         if (buf == NULL)
2678         {
2679             //apple specs say at least 32
2680             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2681             tbuf = (char*) malloc( n ) ;
2682         }
2683
2684         ByteCount byteBufferLen = n ;
2685         UniChar* ubuf = NULL ;
2686
2687 #if SIZEOF_WCHAR_T == 4
2688         wxMBConvUTF16 converter ;
2689         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2690         byteInLen = unicharlen ;
2691         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2692         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2693 #else
2694         ubuf = (UniChar*) psz ;
2695 #endif
2696
2697         status = TECConvertText(
2698             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2699             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2700
2701 #if SIZEOF_WCHAR_T == 4
2702         free( ubuf ) ;
2703 #endif
2704
2705         if ( buf == NULL )
2706             free(tbuf) ;
2707
2708         size_t res = byteOutLen ;
2709         if ( buf  && res < n)
2710         {
2711             buf[res] = 0;
2712
2713             //we need to double-trip to verify it didn't insert any ? in place
2714             //of bogus characters
2715             wxWCharBuffer wcBuf(n);
2716             size_t pszlen = wxWcslen(psz);
2717             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2718                         wxWcslen(wcBuf) != pszlen ||
2719                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2720             {
2721                 // we didn't obtain the same thing we started from, hence
2722                 // the conversion was lossy and we consider that it failed
2723                 return (size_t)-1;
2724             }
2725         }
2726
2727         return res ;
2728     }
2729
2730     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2731
2732     bool IsOk() const
2733         { return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL; }
2734
2735 private:
2736     TECObjectRef m_MB2WC_converter;
2737     TECObjectRef m_WC2MB_converter;
2738
2739     TextEncodingBase m_char_encoding;
2740     TextEncodingBase m_unicode_encoding;
2741 };
2742
2743 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2744
2745 // ============================================================================
2746 // wxEncodingConverter based conversion classes
2747 // ============================================================================
2748
2749 #if wxUSE_FONTMAP
2750
2751 class wxMBConv_wxwin : public wxMBConv
2752 {
2753 private:
2754     void Init()
2755     {
2756         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2757                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2758     }
2759
2760 public:
2761     // temporarily just use wxEncodingConverter stuff,
2762     // so that it works while a better implementation is built
2763     wxMBConv_wxwin(const wxChar* name)
2764     {
2765         if (name)
2766             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2767         else
2768             m_enc = wxFONTENCODING_SYSTEM;
2769
2770         Init();
2771     }
2772
2773     wxMBConv_wxwin(wxFontEncoding enc)
2774     {
2775         m_enc = enc;
2776
2777         Init();
2778     }
2779
2780     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2781     {
2782         size_t inbuf = strlen(psz);
2783         if (buf)
2784         {
2785             if (!m2w.Convert(psz, buf))
2786                 return (size_t)-1;
2787         }
2788         return inbuf;
2789     }
2790
2791     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2792     {
2793         const size_t inbuf = wxWcslen(psz);
2794         if (buf)
2795         {
2796             if (!w2m.Convert(psz, buf))
2797                 return (size_t)-1;
2798         }
2799
2800         return inbuf;
2801     }
2802
2803     virtual size_t GetMBNulLen() const
2804     {
2805         switch ( m_enc )
2806         {
2807             case wxFONTENCODING_UTF16BE:
2808             case wxFONTENCODING_UTF16LE:
2809                 return 2;
2810
2811             case wxFONTENCODING_UTF32BE:
2812             case wxFONTENCODING_UTF32LE:
2813                 return 4;
2814
2815             default:
2816                 return 1;
2817         }
2818     }
2819
2820     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2821
2822     bool IsOk() const { return m_ok; }
2823
2824 public:
2825     wxFontEncoding m_enc;
2826     wxEncodingConverter m2w, w2m;
2827
2828 private:
2829     // were we initialized successfully?
2830     bool m_ok;
2831
2832     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2833 };
2834
2835 // make the constructors available for unit testing
2836 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
2837 {
2838     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2839     if ( !result->IsOk() )
2840     {
2841         delete result;
2842         return 0;
2843     }
2844
2845     return result;
2846 }
2847
2848 #endif // wxUSE_FONTMAP
2849
2850 // ============================================================================
2851 // wxCSConv implementation
2852 // ============================================================================
2853
2854 void wxCSConv::Init()
2855 {
2856     m_name = NULL;
2857     m_convReal =  NULL;
2858     m_deferred = true;
2859 }
2860
2861 wxCSConv::wxCSConv(const wxChar *charset)
2862 {
2863     Init();
2864
2865     if ( charset )
2866     {
2867         SetName(charset);
2868     }
2869
2870 #if wxUSE_FONTMAP
2871     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2872 #else
2873     m_encoding = wxFONTENCODING_SYSTEM;
2874 #endif
2875 }
2876
2877 wxCSConv::wxCSConv(wxFontEncoding encoding)
2878 {
2879     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2880     {
2881         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2882
2883         encoding = wxFONTENCODING_SYSTEM;
2884     }
2885
2886     Init();
2887
2888     m_encoding = encoding;
2889 }
2890
2891 wxCSConv::~wxCSConv()
2892 {
2893     Clear();
2894 }
2895
2896 wxCSConv::wxCSConv(const wxCSConv& conv)
2897         : wxMBConv()
2898 {
2899     Init();
2900
2901     SetName(conv.m_name);
2902     m_encoding = conv.m_encoding;
2903 }
2904
2905 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2906 {
2907     Clear();
2908
2909     SetName(conv.m_name);
2910     m_encoding = conv.m_encoding;
2911
2912     return *this;
2913 }
2914
2915 void wxCSConv::Clear()
2916 {
2917     free(m_name);
2918     delete m_convReal;
2919
2920     m_name = NULL;
2921     m_convReal = NULL;
2922 }
2923
2924 void wxCSConv::SetName(const wxChar *charset)
2925 {
2926     if (charset)
2927     {
2928         m_name = wxStrdup(charset);
2929         m_deferred = true;
2930     }
2931 }
2932
#if wxUSE_FONTMAP
#include "wx/hashmap.h"

// Map from an encoding to a charset name, filled in and consulted by
// wxCSConv::DoCreate(): it holds the name which iconv accepted for that
// encoding, or an empty string if none of the known names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding,
                     wxString,
                     wxIntegerHash,
                     wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
#endif // wxUSE_FONTMAP
2941
2942 wxMBConv *wxCSConv::DoCreate() const
2943 {
2944 #if wxUSE_FONTMAP
2945     wxLogTrace(TRACE_STRCONV,
2946                wxT("creating conversion for %s"),
2947                (m_name ? m_name
2948                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
2949 #endif // wxUSE_FONTMAP
2950
2951     // check for the special case of ASCII or ISO8859-1 charset: as we have
2952     // special knowledge of it anyhow, we don't need to create a special
2953     // conversion object
2954     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2955             m_encoding == wxFONTENCODING_DEFAULT )
2956     {
2957         // don't convert at all
2958         return NULL;
2959     }
2960
2961     // we trust OS to do conversion better than we can so try external
2962     // conversion methods first
2963     //
2964     // the full order is:
2965     //      1. OS conversion (iconv() under Unix or Win32 API)
2966     //      2. hard coded conversions for UTF
2967     //      3. wxEncodingConverter as fall back
2968
2969     // step (1)
2970 #ifdef HAVE_ICONV
2971 #if !wxUSE_FONTMAP
2972     if ( m_name )
2973 #endif // !wxUSE_FONTMAP
2974     {
2975         wxString name(m_name);
2976         wxFontEncoding encoding(m_encoding);
2977
2978         if ( !name.empty() )
2979         {
2980             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2981             if ( conv->IsOk() )
2982                 return conv;
2983
2984             delete conv;
2985
2986 #if wxUSE_FONTMAP
2987             encoding =
2988                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2989 #endif // wxUSE_FONTMAP
2990         }
2991 #if wxUSE_FONTMAP
2992         {
2993             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2994             if ( it != gs_nameCache.end() )
2995             {
2996                 if ( it->second.empty() )
2997                     return NULL;
2998
2999                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3000                 if ( conv->IsOk() )
3001                     return conv;
3002
3003                 delete conv;
3004             }
3005
3006             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3007
3008             for ( ; *names; ++names )
3009             {
3010                 wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3011                 if ( conv->IsOk() )
3012                 {
3013                     gs_nameCache[encoding] = *names;
3014                     return conv;
3015                 }
3016
3017                 delete conv;
3018             }
3019
3020             gs_nameCache[encoding] = _T(""); // cache the failure
3021         }
3022 #endif // wxUSE_FONTMAP
3023     }
3024 #endif // HAVE_ICONV
3025
3026 #ifdef wxHAVE_WIN32_MB2WC
3027     {
3028 #if wxUSE_FONTMAP
3029         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3030                                       : new wxMBConv_win32(m_encoding);
3031         if ( conv->IsOk() )
3032             return conv;
3033
3034         delete conv;
3035 #else
3036         return NULL;
3037 #endif
3038     }
3039 #endif // wxHAVE_WIN32_MB2WC
3040
3041 #if defined(__WXMAC__)
3042     {
3043         // leave UTF16 and UTF32 to the built-ins of wx
3044         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3045             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3046         {
3047 #if wxUSE_FONTMAP
3048             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3049                                         : new wxMBConv_mac(m_encoding);
3050 #else
3051             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3052 #endif
3053             if ( conv->IsOk() )
3054                  return conv;
3055
3056             delete conv;
3057         }
3058     }
3059 #endif
3060
3061 #if defined(__WXCOCOA__)
3062     {
3063         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3064         {
3065 #if wxUSE_FONTMAP
3066             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3067                                           : new wxMBConv_cocoa(m_encoding);
3068 #else
3069             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3070 #endif
3071
3072             if ( conv->IsOk() )
3073                  return conv;
3074
3075             delete conv;
3076         }
3077     }
3078 #endif
3079     // step (2)
3080     wxFontEncoding enc = m_encoding;
3081 #if wxUSE_FONTMAP
3082     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3083     {
3084         // use "false" to suppress interactive dialogs -- we can be called from
3085         // anywhere and popping up a dialog from here is the last thing we want to
3086         // do
3087         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3088     }
3089 #endif // wxUSE_FONTMAP
3090
3091     switch ( enc )
3092     {
3093         case wxFONTENCODING_UTF7:
3094              return new wxMBConvUTF7;
3095
3096         case wxFONTENCODING_UTF8:
3097              return new wxMBConvUTF8;
3098
3099         case wxFONTENCODING_UTF16BE:
3100              return new wxMBConvUTF16BE;
3101
3102         case wxFONTENCODING_UTF16LE:
3103              return new wxMBConvUTF16LE;
3104
3105         case wxFONTENCODING_UTF32BE:
3106              return new wxMBConvUTF32BE;
3107
3108         case wxFONTENCODING_UTF32LE:
3109              return new wxMBConvUTF32LE;
3110
3111         default:
3112              // nothing to do but put here to suppress gcc warnings
3113              break;
3114     }
3115
3116     // step (3)
3117 #if wxUSE_FONTMAP
3118     {
3119         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3120                                       : new wxMBConv_wxwin(m_encoding);
3121         if ( conv->IsOk() )
3122             return conv;
3123
3124         delete conv;
3125     }
3126 #endif // wxUSE_FONTMAP
3127
3128     // NB: This is a hack to prevent deadlock. What could otherwise happen
3129     //     in Unicode build: wxConvLocal creation ends up being here
3130     //     because of some failure and logs the error. But wxLog will try to
3131     //     attach timestamp, for which it will need wxConvLocal (to convert
3132     //     time to char* and then wchar_t*), but that fails, tries to log
3133     //     error, but wxLog has a (already locked) critical section that
3134     //     guards static buffer.
3135     static bool alreadyLoggingError = false;
3136     if (!alreadyLoggingError)
3137     {
3138         alreadyLoggingError = true;
3139         wxLogError(_("Cannot convert from the charset '%s'!"),
3140                    m_name ? m_name
3141                       :
3142 #if wxUSE_FONTMAP
3143                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3144 #else // !wxUSE_FONTMAP
3145                          wxString::Format(_("encoding %s"), m_encoding).c_str()
3146 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3147               );
3148
3149         alreadyLoggingError = false;
3150     }
3151
3152     return NULL;
3153 }
3154
3155 void wxCSConv::CreateConvIfNeeded() const
3156 {
3157     if ( m_deferred )
3158     {
3159         wxCSConv *self = (wxCSConv *)this; // const_cast
3160
3161 #if wxUSE_INTL
3162         // if we don't have neither the name nor the encoding, use the default
3163         // encoding for this system
3164         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3165         {
3166             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3167         }
3168 #endif // wxUSE_INTL
3169
3170         self->m_convReal = DoCreate();
3171         self->m_deferred = false;
3172     }
3173 }
3174
3175 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3176 {
3177     CreateConvIfNeeded();
3178
3179     if (m_convReal)
3180         return m_convReal->MB2WC(buf, psz, n);
3181
3182     // latin-1 (direct)
3183     size_t len = strlen(psz);
3184
3185     if (buf)
3186     {
3187         for (size_t c = 0; c <= len; c++)
3188             buf[c] = (unsigned char)(psz[c]);
3189     }
3190
3191     return len;
3192 }
3193
3194 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3195 {
3196     CreateConvIfNeeded();
3197
3198     if (m_convReal)
3199         return m_convReal->WC2MB(buf, psz, n);
3200
3201     // latin-1 (direct)
3202     const size_t len = wxWcslen(psz);
3203     if (buf)
3204     {
3205         for (size_t c = 0; c <= len; c++)
3206         {
3207             if (psz[c] > 0xFF)
3208                 return (size_t)-1;
3209
3210             buf[c] = (char)psz[c];
3211         }
3212     }
3213     else
3214     {
3215         for (size_t c = 0; c <= len; c++)
3216         {
3217             if (psz[c] > 0xFF)
3218                 return (size_t)-1;
3219         }
3220     }
3221
3222     return len;
3223 }
3224
3225 size_t wxCSConv::GetMBNulLen() const
3226 {
3227     CreateConvIfNeeded();
3228
3229     if ( m_convReal )
3230     {
3231         return m_convReal->GetMBNulLen();
3232     }
3233
3234     return 1;
3235 }
3236
3237 // ----------------------------------------------------------------------------
3238 // globals
3239 // ----------------------------------------------------------------------------
3240
3241 #ifdef __WINDOWS__
3242     static wxMBConv_win32 wxConvLibcObj;
3243 #elif defined(__WXMAC__) && !defined(__MACH__)
3244     static wxMBConv_mac wxConvLibcObj ;
3245 #else
3246     static wxMBConvLibc wxConvLibcObj;
3247 #endif
3248
3249 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3250 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3251 static wxMBConvUTF7 wxConvUTF7Obj;
3252 static wxMBConvUTF8 wxConvUTF8Obj;
3253
3254 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3255 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3256 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3257 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3258 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
3259 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3260 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3261 #ifdef __WXOSX__
3262                                     wxConvUTF8Obj;
3263 #else
3264                                     wxConvLibcObj;
3265 #endif
3266
3267
3268 #else // !wxUSE_WCHAR_T
3269
3270 // stand-ins in absence of wchar_t
3271 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3272                                 wxConvISO8859_1,
3273                                 wxConvLocal,
3274                                 wxConvUTF8;
3275
3276 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T