src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifndef WX_PRECOMP
  19     #ifdef __WXMSW__
  20         #include "wx/msw/missing.h"
  21     #endif
  22     #include "wx/intl.h"
  23     #include "wx/log.h"
  24     #include "wx/utils.h"
  25     #include "wx/hashmap.h"
  26 #endif
  27
  28 #include "wx/strconv.h"
  29
  30 #if wxUSE_WCHAR_T
  31
  32 #ifdef __WINDOWS__
  33     #include "wx/msw/private.h"
  34 #endif
  35
  36 #ifndef __WXWINCE__
  37 #include <errno.h>
  38 #endif
  39
  40 #include <ctype.h>
  41 #include <string.h>
  42 #include <stdlib.h>
  43
  44 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  45     #define wxHAVE_WIN32_MB2WC
  46 #endif
  47
  48 #ifdef __SALFORDC__
  49     #include <clib.h>
  50 #endif
  51
  52 #ifdef HAVE_ICONV
  53     #include <iconv.h>
  54     #include "wx/thread.h"
  55 #endif
  56
  57 #include "wx/encconv.h"
  58 #include "wx/fontmap.h"
  59
  60 #ifdef __WXMAC__
  61 #ifndef __DARWIN__
  62 #include <ATSUnicode.h>
  63 #include <TextCommon.h>
  64 #include <TextEncodingConverter.h>
  65 #endif
  66
  67 // includes Mac headers
  68 #include "wx/mac/private.h"
  69 #endif
  70
  71
  72 #define TRACE_STRCONV _T("strconv")
  73
  74 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  75 // be 4 bytes
  76 #if SIZEOF_WCHAR_T == 2
  77     #define WC_UTF16
  78 #endif
  79
  80
  81 // ============================================================================
  82 // implementation
  83 // ============================================================================
  84
  85 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  86 static bool NotAllNULs(const char *p, size_t n)
  87 {
  88     while ( n && *p++ == '\0' )
  89         n--;
  90
  91     return n != 0;
  92 }
  93
  94 // ----------------------------------------------------------------------------
  95 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  96 // ----------------------------------------------------------------------------
  97
  98 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  99 {
 100     if (input <= 0xffff)
 101     {
 102         if (output)
 103             *output = (wxUint16) input;
 104
 105         return 1;
 106     }
 107     else if (input >= 0x110000)
 108     {
 109         return wxCONV_FAILED;
 110     }
 111     else
 112     {
 113         if (output)
 114         {
 115             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 116             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 117         }
 118
 119         return 2;
 120     }
 121 }
 122
 123 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 124 {
 125     if ((*input < 0xd800) || (*input > 0xdfff))
 126     {
 127         output = *input;
 128         return 1;
 129     }
 130     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 131     {
 132         output = *input;
 133         return wxCONV_FAILED;
 134     }
 135     else
 136     {
 137         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 138         return 2;
 139     }
 140 }
 141
// type of the code units read by wxDecodeSurrogate(): wchar_t itself when it
// is 16 bits wide (WC_UTF16 defined), an explicit 16 bit integer otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 147
 148 // returns the next UTF-32 character from the wchar_t buffer and advances the
 149 // pointer to the character after this one
 150 //
 151 // if an invalid character is found, *pSrc is set to NULL, the caller must
 152 // check for this
 153 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 154 {
 155     wxUint32 out;
 156     const size_t
 157         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 158     if ( n == wxCONV_FAILED )
 159         *pSrc = NULL;
 160     else
 161         *pSrc += n;
 162
 163     return out;
 164 }
 165
 166 // ----------------------------------------------------------------------------
 167 // wxMBConv
 168 // ----------------------------------------------------------------------------
 169
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// dst    - output buffer or NULL to only compute the required size
// dstLen - size of dst in wide characters (only looked at if dst != NULL)
// src    - multibyte input
// srcLen - size of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written)
// to dst, counting the L'\0' terminating each converted chunk, or
// wxCONV_FAILED if the conversion failed or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the L'\0' of this empty chunk has been counted in
            // dstWritten above but is not stored in dst as we leave the loop
            // before the MB2WC() call below
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 276
 277 size_t
 278 wxMBConv::FromWChar(char *dst, size_t dstLen,
 279                     const wchar_t *src, size_t srcLen) const
 280 {
 281     // the number of chars [which would be] written to dst [if it were not NULL]
 282     size_t dstWritten = 0;
 283
 284     // make a copy of the input string unless it is already properly
 285     // NUL-terminated
 286     //
 287     // if we don't know its length we have no choice but to assume that it is,
 288     // indeed, properly terminated
 289     wxWCharBuffer bufTmp;
 290     if ( srcLen == wxNO_LEN )
 291     {
 292         srcLen = wxWcslen(src) + 1;
 293     }
 294     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 295     {
 296         // make a copy in order to properly NUL-terminate the string
 297         bufTmp = wxWCharBuffer(srcLen);
 298         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 299         src = bufTmp;
 300     }
 301
 302     const size_t lenNul = GetMBNulLen();
 303     for ( const wchar_t * const srcEnd = src + srcLen;
 304           src < srcEnd;
 305           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 306     {
 307         // try to convert the current chunk
 308         size_t lenChunk = WC2MB(NULL, src, 0);
 309
 310         if ( lenChunk == wxCONV_FAILED )
 311             return wxCONV_FAILED;
 312
 313         lenChunk += lenNul;
 314         dstWritten += lenChunk;
 315
 316         if ( dst )
 317         {
 318             if ( dstWritten > dstLen )
 319                 return wxCONV_FAILED;
 320
 321             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 322                 return wxCONV_FAILED;
 323
 324             dst += lenChunk;
 325         }
 326     }
 327
 328     return dstWritten;
 329 }
 330
 331 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 332 {
 333     size_t rc = ToWChar(outBuff, outLen, inBuff);
 334     if ( rc != wxCONV_FAILED )
 335     {
 336         // ToWChar() returns the buffer length, i.e. including the trailing
 337         // NUL, while this method doesn't take it into account
 338         rc--;
 339     }
 340
 341     return rc;
 342 }
 343
 344 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 345 {
 346     size_t rc = FromWChar(outBuff, outLen, inBuff);
 347     if ( rc != wxCONV_FAILED )
 348     {
 349         rc -= GetMBNulLen();
 350     }
 351
 352     return rc;
 353 }
 354
 355 wxMBConv::~wxMBConv()
 356 {
 357     // nothing to do here (necessary for Darwin linking probably)
 358 }
 359
 360 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 361 {
 362     if ( psz )
 363     {
 364         // calculate the length of the buffer needed first
 365         const size_t nLen = MB2WC(NULL, psz, 0);
 366         if ( nLen != wxCONV_FAILED )
 367         {
 368             // now do the actual conversion
 369             wxWCharBuffer buf(nLen /* +1 added implicitly */);
 370
 371             // +1 for the trailing NULL
 372             if ( MB2WC(buf.data(), psz, nLen + 1) != wxCONV_FAILED )
 373                 return buf;
 374         }
 375     }
 376
 377     return wxWCharBuffer();
 378 }
 379
 380 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 381 {
 382     if ( pwz )
 383     {
 384         const size_t nLen = WC2MB(NULL, pwz, 0);
 385         if ( nLen != wxCONV_FAILED )
 386         {
 387             // extra space for trailing NUL(s)
 388             static const size_t extraLen = GetMaxMBNulLen();
 389
 390             wxCharBuffer buf(nLen + extraLen - 1);
 391             if ( WC2MB(buf.data(), pwz, nLen + extraLen) != wxCONV_FAILED )
 392                 return buf;
 393         }
 394     }
 395
 396     return wxCharBuffer();
 397 }
 398
 399 const wxWCharBuffer
 400 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 401 {
 402     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 403     if ( dstLen != wxCONV_FAILED )
 404     {
 405         wxWCharBuffer wbuf(dstLen - 1);
 406         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 407         {
 408             if ( outLen )
 409             {
 410                 *outLen = dstLen;
 411                 if ( wbuf[dstLen - 1] == L'\0' )
 412                     (*outLen)--;
 413             }
 414
 415             return wbuf;
 416         }
 417     }
 418
 419     if ( outLen )
 420         *outLen = 0;
 421
 422     return wxWCharBuffer();
 423 }
 424
 425 const wxCharBuffer
 426 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 427 {
 428     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 429     if ( dstLen != wxCONV_FAILED )
 430     {
 431         // special case of empty input: can't allocate 0 size buffer below as
 432         // wxCharBuffer insists on NUL-terminating it
 433         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 434         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 435         {
 436             if ( outLen )
 437             {
 438                 *outLen = dstLen;
 439
 440                 const size_t nulLen = GetMBNulLen();
 441                 if ( dstLen >= nulLen &&
 442                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 443                 {
 444                     // in this case the output is NUL-terminated and we're not
 445                     // supposed to count NUL
 446                     *outLen -= nulLen;
 447                 }
 448             }
 449
 450             return buf;
 451         }
 452     }
 453
 454     if ( outLen )
 455         *outLen = 0;
 456
 457     return wxCharBuffer();
 458 }
 459
 460 // ----------------------------------------------------------------------------
 461 // wxMBConvLibc
 462 // ----------------------------------------------------------------------------
 463
 464 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 465 {
 466     return wxMB2WC(buf, psz, n);
 467 }
 468
 469 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 470 {
 471     return wxWC2MB(buf, psz, n);
 472 }
 473
 474 // ----------------------------------------------------------------------------
 475 // wxConvBrokenFileNames
 476 // ----------------------------------------------------------------------------
 477
 478 #ifdef __UNIX__
 479
 480 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxChar *charset)
 481 {
 482     if ( !charset || wxStricmp(charset, _T("UTF-8")) == 0
 483                   || wxStricmp(charset, _T("UTF8")) == 0  )
 484         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL);
 485     else
 486         m_conv = new wxCSConv(charset);
 487 }
 488
 489 #endif // __UNIX__
 490
 491 // ----------------------------------------------------------------------------
 492 // UTF-7
 493 // ----------------------------------------------------------------------------
 494
 495 // Implementation (C) 2004 Fredrik Roubert
 496
//
// BASE64 decoding table
//
// Indexed by the value of an input byte (hence 256 entries): gives the 6 bit
// value of the bytes belonging to the BASE64 alphabet ('A'-'Z', 'a'-'z',
// '0'-'9', '+' and '/') and 0xff for all the other ones.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 535
 536 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 537 {
 538     size_t len = 0;
 539
 540     while ( *psz && (!buf || (len < n)) )
 541     {
 542         unsigned char cc = *psz++;
 543         if (cc != '+')
 544         {
 545             // plain ASCII char
 546             if (buf)
 547                 *buf++ = cc;
 548             len++;
 549         }
 550         else if (*psz == '-')
 551         {
 552             // encoded plus sign
 553             if (buf)
 554                 *buf++ = cc;
 555             len++;
 556             psz++;
 557         }
 558         else // start of BASE64 encoded string
 559         {
 560             bool lsb, ok;
 561             unsigned int d, l;
 562             for ( ok = lsb = false, d = 0, l = 0;
 563                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 564                   psz++ )
 565             {
 566                 d <<= 6;
 567                 d += cc;
 568                 for (l += 6; l >= 8; lsb = !lsb)
 569                 {
 570                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 571                     if (lsb)
 572                     {
 573                         if (buf)
 574                             *buf++ |= c;
 575                         len ++;
 576                     }
 577                     else
 578                     {
 579                         if (buf)
 580                             *buf = (wchar_t)(c << 8);
 581                     }
 582
 583                     ok = true;
 584                 }
 585             }
 586
 587             if ( !ok )
 588             {
 589                 // in valid UTF7 we should have valid characters after '+'
 590                 return wxCONV_FAILED;
 591             }
 592
 593             if (*psz == '-')
 594                 psz++;
 595         }
 596     }
 597
 598     if ( buf && (len < n) )
 599         *buf = '\0';
 600
 601     return len;
 602 }
 603
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63), gives the corresponding character of the
// BASE64 alphabet.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 618
//
// UTF-7 encoding table
//
// Indexed by the ASCII code (0..127) of a character, gives its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// wxMBConvUTF7::WC2MB() only copies the characters of class 0 directly, all
// the others are BASE64-encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 638
// Encodes the NUL-terminated wide string psz as UTF-7.
//
// buf - output buffer or NULL to only compute the length of the result
// psz - wide character input
// n   - size of buf in bytes (only used if buf is not NULL)
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only stored if there is room left for it), or, when WC_UTF16 is not
// defined, wxCONV_FAILED if a character above 0xffff is found.
//
// NOTE(review): n is only checked before starting to process a new input
// character, the several bytes which may be written for it ('+', BASE64
// digits, '-') are stored without further checks.
size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wchar_t cc = *psz++;
        // NOTE(review): cc is used as index here, this presumably relies on
        // it never being negative even where wchar_t is signed -- confirm
        if (cc < 0x80 && utf7encode[cc] < 1)
        {
            // plain ASCII char
            if (buf)
                *buf++ = (char)cc;

            len++;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // '+' starts a shifted sequence
            if (buf)
                *buf++ = '+';

            len++;
            if (cc != '+')
            {
                // BASE64 encode string
                //
                // d accumulates the bits to encode and l is the number of
                // bits in it which were not output yet
                unsigned int lsb, d, l;
                for (d = 0, l = 0; /*nothing*/; psz++)
                {
                    // feed the high byte of the character first, then the
                    // low one
                    for (lsb = 0; lsb < 2; lsb ++)
                    {
                        d <<= 8;
                        d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                        // output all complete groups of 6 bits
                        for (l += 8; l >= 6; )
                        {
                            l -= 6;
                            if (buf)
                                *buf++ = utf7enb64[(d >> l) % 64];
                            len++;
                        }
                    }

                    // stay in BASE64 mode until the end of the string or the
                    // next character which can be encoded directly
                    cc = *psz;
                    if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
                        break;
                }

                // output the remaining bits, if any, as the most significant
                // bits of a last BASE64 digit
                if (l != 0)
                {
                    if (buf)
                        *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];

                    len++;
                }
            }
            // else: the plus sign itself is encoded as "+-"

            // '-' ends the shifted sequence
            if (buf)
                *buf++ = '-';
            len++;
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 712
 713 // ----------------------------------------------------------------------------
 714 // UTF-8
 715 // ----------------------------------------------------------------------------
 716
// utf8_max[i] is the greatest value which can be encoded using i + 1 bytes;
// the last element is the maximal wxUint32 value so that the search for the
// length of a character done in wxMBConvUTF8::WC2MB() always terminates
static wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: byte b is mapped to
// wxUnicodePUA + b, hence the range of 256 values
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 724
// Decodes the NUL-terminated UTF-8 string psz.
//
// buf - output buffer or NULL to only compute the length of the result
// psz - UTF-8 encoded input
// n   - size of buf in wide characters (only used if buf is not NULL)
//
// Invalid input is handled according to m_options:
//  - with MAP_INVALID_UTF8_TO_PUA each byte b of an invalid sequence is
//    converted to the character wxUnicodePUA + b
//  - with MAP_INVALID_UTF8_TO_OCTAL each such byte is converted to a
//    backslash followed by 3 octal digits (and backslashes present in the
//    input are doubled)
//  - otherwise the conversion fails
//
// Returns the number of wide characters produced, not counting the trailing
// NUL (which is only stored if there is room left for it), or wxCONV_FAILED.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of this sequence to be able to remap its bytes
        // if it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                // (a continuation byte can't start a character)
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // the value must not fit in a shorter sequence
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    //
                    // NOTE(review): this stores 2 code units for res above
                    // 0xffff but only len < n was checked by the loop
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map each byte consumed so far to a PUA character
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // map each byte consumed so far to an octal escape
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if all 4 of its
                        // characters fit, but it is counted in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 868
 869 static inline bool isoctal(wchar_t wch)
 870 {
 871     return L'0' <= wch && wch <= L'7';
 872 }
 873
// Encodes the NUL-terminated wide string psz as UTF-8.
//
// buf - output buffer or NULL to only compute the length of the result
// psz - wide character input
// n   - size of buf in bytes (only used if buf is not NULL)
//
// Depending on m_options, the remapping of invalid bytes done by MB2WC() is
// undone: with MAP_INVALID_UTF8_TO_PUA the characters in the range
// [wxUnicodePUA, wxUnicodePUAEnd) and with MAP_INVALID_UTF8_TO_OCTAL the
// backslash escapes are converted back to single bytes.
//
// Returns the number of bytes produced, not counting the trailing NUL (which
// is only stored if there is room left for it).
//
// NOTE(review): n is only checked before starting to process a new input
// character, all the bytes of a multibyte sequence are then stored without
// further checks.
size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);

        // an invalid surrogate is not an error here: it is skipped and
        // encoded as is (decode_utf16() stores it in cc in this case)
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // restore the original byte from our PUA character
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // escaped backslash: output a single one
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // octal escape: output the byte it stands for
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this character
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte has cnt + 1 high bits set followed by
                    // the most significant bits of the character, each of
                    // the continuation bytes carries its next 6 bits
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    if (buf && (len < n))
        *buf = 0;

    return len;
}
 951
 952 // ============================================================================
 953 // UTF-16
 954 // ============================================================================
 955
// the "straight" conversion is the one to the UTF-16 variant selected by
// WORDS_BIGENDIAN (BE if it is defined, LE otherwise) and the "swap" one is
// the conversion to the other variant
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 963
 964 /* static */
 965 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
 966 {
 967     if ( srcLen == wxNO_LEN )
 968     {
 969         // count the number of bytes in input, including the trailing NULs
 970         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
 971         for ( srcLen = 1; *inBuff++; srcLen++ )
 972             ;
 973
 974         srcLen *= BYTES_PER_CHAR;
 975     }
 976     else // we already have the length
 977     {
 978         // we can only convert an entire number of UTF-16 characters
 979         if ( srcLen % BYTES_PER_CHAR )
 980             return wxCONV_FAILED;
 981     }
 982
 983     return srcLen;
 984 }
 985
 986 // case when in-memory representation is UTF-16 too
 987 #ifdef WC_UTF16
 988
 989 // ----------------------------------------------------------------------------
 990 // conversions without endianness change
 991 // ----------------------------------------------------------------------------
 992
 993 size_t
 994 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
 995                                const char *src, size_t srcLen) const
 996 {
 997     // set up the scene for using memcpy() (which is presumably more efficient
 998     // than copying the bytes one by one)
 999     srcLen = GetLength(src, srcLen);
1000     if ( srcLen == wxNO_LEN )
1001         return wxCONV_FAILED;
1002
1003     const size_t inLen = srcLen / BYTES_PER_CHAR;
1004     if ( dst )
1005     {
1006         if ( dstLen < inLen )
1007             return wxCONV_FAILED;
1008
1009         memcpy(dst, src, srcLen);
1010     }
1011
1012     return inLen;
1013 }
1014
1015 size_t
1016 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1017                                  const wchar_t *src, size_t srcLen) const
1018 {
1019     if ( srcLen == wxNO_LEN )
1020         srcLen = wxWcslen(src) + 1;
1021
1022     srcLen *= BYTES_PER_CHAR;
1023
1024     if ( dst )
1025     {
1026         if ( dstLen < srcLen )
1027             return wxCONV_FAILED;
1028
1029         memcpy(dst, src, srcLen);
1030     }
1031
1032     return srcLen;
1033 }
1034
1035 // ----------------------------------------------------------------------------
1036 // endian-reversing conversions
1037 // ----------------------------------------------------------------------------
1038
1039 size_t
1040 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1041                            const char *src, size_t srcLen) const
1042 {
1043     srcLen = GetLength(src, srcLen);
1044     if ( srcLen == wxNO_LEN )
1045         return wxCONV_FAILED;
1046
1047     srcLen /= BYTES_PER_CHAR;
1048
1049     if ( dst )
1050     {
1051         if ( dstLen < srcLen )
1052             return wxCONV_FAILED;
1053
1054         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1055         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1056         {
1057             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1058         }
1059     }
1060
1061     return srcLen;
1062 }
1063
1064 size_t
1065 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1066                              const wchar_t *src, size_t srcLen) const
1067 {
1068     if ( srcLen == wxNO_LEN )
1069         srcLen = wxWcslen(src) + 1;
1070
1071     srcLen *= BYTES_PER_CHAR;
1072
1073     if ( dst )
1074     {
1075         if ( dstLen < srcLen )
1076             return wxCONV_FAILED;
1077
1078         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1079         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1080         {
1081             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1082         }
1083     }
1084
1085     return srcLen;
1086 }
1087
1088 #else // !WC_UTF16: wchar_t is UTF-32
1089
1090 // ----------------------------------------------------------------------------
1091 // conversions without endianness change
1092 // ----------------------------------------------------------------------------
1093
1094 size_t
1095 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1096                                const char *src, size_t srcLen) const
1097 {
1098     srcLen = GetLength(src, srcLen);
1099     if ( srcLen == wxNO_LEN )
1100         return wxCONV_FAILED;
1101
1102     const size_t inLen = srcLen / BYTES_PER_CHAR;
1103     if ( !dst )
1104     {
1105         // optimization: return maximal space which could be needed for this
1106         // string even if the real size could be smaller if the buffer contains
1107         // any surrogates
1108         return inLen;
1109     }
1110
1111     size_t outLen = 0;
1112     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1113     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1114     {
1115         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1116         if ( !inBuff )
1117             return wxCONV_FAILED;
1118
1119         if ( ++outLen > dstLen )
1120             return wxCONV_FAILED;
1121
1122         *dst++ = ch;
1123     }
1124
1125
1126     return outLen;
1127 }
1128
1129 size_t
1130 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1131                                  const wchar_t *src, size_t srcLen) const
1132 {
1133     if ( srcLen == wxNO_LEN )
1134         srcLen = wxWcslen(src) + 1;
1135
1136     size_t outLen = 0;
1137     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1138     for ( size_t n = 0; n < srcLen; n++ )
1139     {
1140         wxUint16 cc[2];
1141         const size_t numChars = encode_utf16(*src++, cc);
1142         if ( numChars == wxCONV_FAILED )
1143             return wxCONV_FAILED;
1144
1145         outLen += numChars * BYTES_PER_CHAR;
1146         if ( outBuff )
1147         {
1148             if ( outLen > dstLen )
1149                 return wxCONV_FAILED;
1150
1151             *outBuff++ = cc[0];
1152             if ( numChars == 2 )
1153             {
1154                 // second character of a surrogate
1155                 *outBuff++ = cc[1];
1156             }
1157         }
1158     }
1159
1160     return outLen;
1161 }
1162
1163 // ----------------------------------------------------------------------------
1164 // endian-reversing conversions
1165 // ----------------------------------------------------------------------------
1166
1167 size_t
1168 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1169                            const char *src, size_t srcLen) const
1170 {
1171     srcLen = GetLength(src, srcLen);
1172     if ( srcLen == wxNO_LEN )
1173         return wxCONV_FAILED;
1174
1175     const size_t inLen = srcLen / BYTES_PER_CHAR;
1176     if ( !dst )
1177     {
1178         // optimization: return maximal space which could be needed for this
1179         // string even if the real size could be smaller if the buffer contains
1180         // any surrogates
1181         return inLen;
1182     }
1183
1184     size_t outLen = 0;
1185     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1186     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1187     {
1188         wxUint32 ch;
1189         wxUint16 tmp[2];
1190
1191         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1192         inBuff++;
1193         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1194
1195         const size_t numChars = decode_utf16(tmp, ch);
1196         if ( numChars == wxCONV_FAILED )
1197             return wxCONV_FAILED;
1198
1199         if ( numChars == 2 )
1200             inBuff++;
1201
1202         if ( ++outLen > dstLen )
1203             return wxCONV_FAILED;
1204
1205         *dst++ = ch;
1206     }
1207
1208
1209     return outLen;
1210 }
1211
1212 size_t
1213 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1214                              const wchar_t *src, size_t srcLen) const
1215 {
1216     if ( srcLen == wxNO_LEN )
1217         srcLen = wxWcslen(src) + 1;
1218
1219     size_t outLen = 0;
1220     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1221     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1222     {
1223         wxUint16 cc[2];
1224         const size_t numChars = encode_utf16(*src, cc);
1225         if ( numChars == wxCONV_FAILED )
1226             return wxCONV_FAILED;
1227
1228         outLen += numChars * BYTES_PER_CHAR;
1229         if ( outBuff )
1230         {
1231             if ( outLen > dstLen )
1232                 return wxCONV_FAILED;
1233
1234             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1235             if ( numChars == 2 )
1236             {
1237                 // second character of a surrogate
1238                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1239             }
1240         }
1241     }
1242
1243     return outLen;
1244 }
1245
1246 #endif // WC_UTF16/!WC_UTF16
1247
1248
1249 // ============================================================================
1250 // UTF-32
1251 // ============================================================================
1252
1253 #ifdef WORDS_BIGENDIAN
1254     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1255     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1256 #else
1257     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1258     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1259 #endif
1260
1261
1262 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1263 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1264
1265 /* static */
1266 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1267 {
1268     if ( srcLen == wxNO_LEN )
1269     {
1270         // count the number of bytes in input, including the trailing NULs
1271         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1272         for ( srcLen = 1; *inBuff++; srcLen++ )
1273             ;
1274
1275         srcLen *= BYTES_PER_CHAR;
1276     }
1277     else // we already have the length
1278     {
1279         // we can only convert an entire number of UTF-32 characters
1280         if ( srcLen % BYTES_PER_CHAR )
1281             return wxCONV_FAILED;
1282     }
1283
1284     return srcLen;
1285 }
1286
1287 // case when in-memory representation is UTF-16
1288 #ifdef WC_UTF16
1289
1290 // ----------------------------------------------------------------------------
1291 // conversions without endianness change
1292 // ----------------------------------------------------------------------------
1293
1294 size_t
1295 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1296                                const char *src, size_t srcLen) const
1297 {
1298     srcLen = GetLength(src, srcLen);
1299     if ( srcLen == wxNO_LEN )
1300         return wxCONV_FAILED;
1301
1302     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1303     const size_t inLen = srcLen / BYTES_PER_CHAR;
1304     size_t outLen = 0;
1305     for ( size_t n = 0; n < inLen; n++ )
1306     {
1307         wxUint16 cc[2];
1308         const size_t numChars = encode_utf16(*inBuff++, cc);
1309         if ( numChars == wxCONV_FAILED )
1310             return wxCONV_FAILED;
1311
1312         outLen += numChars;
1313         if ( dst )
1314         {
1315             if ( outLen > dstLen )
1316                 return wxCONV_FAILED;
1317
1318             *dst++ = cc[0];
1319             if ( numChars == 2 )
1320             {
1321                 // second character of a surrogate
1322                 *dst++ = cc[1];
1323             }
1324         }
1325     }
1326
1327     return outLen;
1328 }
1329
1330 size_t
1331 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1332                                  const wchar_t *src, size_t srcLen) const
1333 {
1334     if ( srcLen == wxNO_LEN )
1335         srcLen = wxWcslen(src) + 1;
1336
1337     if ( !dst )
1338     {
1339         // optimization: return maximal space which could be needed for this
1340         // string instead of the exact amount which could be less if there are
1341         // any surrogates in the input
1342         //
1343         // we consider that surrogates are rare enough to make it worthwhile to
1344         // avoid running the loop below at the cost of slightly extra memory
1345         // consumption
1346         return srcLen * BYTES_PER_CHAR;
1347     }
1348
1349     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1350     size_t outLen = 0;
1351     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1352     {
1353         const wxUint32 ch = wxDecodeSurrogate(&src);
1354         if ( !src )
1355             return wxCONV_FAILED;
1356
1357         outLen += BYTES_PER_CHAR;
1358
1359         if ( outLen > dstLen )
1360             return wxCONV_FAILED;
1361
1362         *outBuff++ = ch;
1363     }
1364
1365     return outLen;
1366 }
1367
1368 // ----------------------------------------------------------------------------
1369 // endian-reversing conversions
1370 // ----------------------------------------------------------------------------
1371
1372 size_t
1373 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1374                            const char *src, size_t srcLen) const
1375 {
1376     srcLen = GetLength(src, srcLen);
1377     if ( srcLen == wxNO_LEN )
1378         return wxCONV_FAILED;
1379
1380     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1381     const size_t inLen = srcLen / BYTES_PER_CHAR;
1382     size_t outLen = 0;
1383     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1384     {
1385         wxUint16 cc[2];
1386         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1387         if ( numChars == wxCONV_FAILED )
1388             return wxCONV_FAILED;
1389
1390         outLen += numChars;
1391         if ( dst )
1392         {
1393             if ( outLen > dstLen )
1394                 return wxCONV_FAILED;
1395
1396             *dst++ = cc[0];
1397             if ( numChars == 2 )
1398             {
1399                 // second character of a surrogate
1400                 *dst++ = cc[1];
1401             }
1402         }
1403     }
1404
1405     return outLen;
1406 }
1407
1408 size_t
1409 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1410                              const wchar_t *src, size_t srcLen) const
1411 {
1412     if ( srcLen == wxNO_LEN )
1413         srcLen = wxWcslen(src) + 1;
1414
1415     if ( !dst )
1416     {
1417         // optimization: return maximal space which could be needed for this
1418         // string instead of the exact amount which could be less if there are
1419         // any surrogates in the input
1420         //
1421         // we consider that surrogates are rare enough to make it worthwhile to
1422         // avoid running the loop below at the cost of slightly extra memory
1423         // consumption
1424         return srcLen*BYTES_PER_CHAR;
1425     }
1426
1427     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1428     size_t outLen = 0;
1429     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1430     {
1431         const wxUint32 ch = wxDecodeSurrogate(&src);
1432         if ( !src )
1433             return wxCONV_FAILED;
1434
1435         outLen += BYTES_PER_CHAR;
1436
1437         if ( outLen > dstLen )
1438             return wxCONV_FAILED;
1439
1440         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1441     }
1442
1443     return outLen;
1444 }
1445
1446 #else // !WC_UTF16: wchar_t is UTF-32
1447
1448 // ----------------------------------------------------------------------------
1449 // conversions without endianness change
1450 // ----------------------------------------------------------------------------
1451
1452 size_t
1453 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1454                                const char *src, size_t srcLen) const
1455 {
1456     // use memcpy() as it should be much faster than hand-written loop
1457     srcLen = GetLength(src, srcLen);
1458     if ( srcLen == wxNO_LEN )
1459         return wxCONV_FAILED;
1460
1461     const size_t inLen = srcLen/BYTES_PER_CHAR;
1462     if ( dst )
1463     {
1464         if ( dstLen < inLen )
1465             return wxCONV_FAILED;
1466
1467         memcpy(dst, src, srcLen);
1468     }
1469
1470     return inLen;
1471 }
1472
1473 size_t
1474 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1475                                  const wchar_t *src, size_t srcLen) const
1476 {
1477     if ( srcLen == wxNO_LEN )
1478         srcLen = wxWcslen(src) + 1;
1479
1480     srcLen *= BYTES_PER_CHAR;
1481
1482     if ( dst )
1483     {
1484         if ( dstLen < srcLen )
1485             return wxCONV_FAILED;
1486
1487         memcpy(dst, src, srcLen);
1488     }
1489
1490     return srcLen;
1491 }
1492
1493 // ----------------------------------------------------------------------------
1494 // endian-reversing conversions
1495 // ----------------------------------------------------------------------------
1496
1497 size_t
1498 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1499                            const char *src, size_t srcLen) const
1500 {
1501     srcLen = GetLength(src, srcLen);
1502     if ( srcLen == wxNO_LEN )
1503         return wxCONV_FAILED;
1504
1505     srcLen /= BYTES_PER_CHAR;
1506
1507     if ( dst )
1508     {
1509         if ( dstLen < srcLen )
1510             return wxCONV_FAILED;
1511
1512         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1513         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1514         {
1515             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1516         }
1517     }
1518
1519     return srcLen;
1520 }
1521
1522 size_t
1523 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1524                              const wchar_t *src, size_t srcLen) const
1525 {
1526     if ( srcLen == wxNO_LEN )
1527         srcLen = wxWcslen(src) + 1;
1528
1529     srcLen *= BYTES_PER_CHAR;
1530
1531     if ( dst )
1532     {
1533         if ( dstLen < srcLen )
1534             return wxCONV_FAILED;
1535
1536         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1537         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1538         {
1539             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1540         }
1541     }
1542
1543     return srcLen;
1544 }
1545
1546 #endif // WC_UTF16/!WC_UTF16
1547
1548
1549 // ============================================================================
1550 // The classes doing conversion using the iconv_xxx() functions
1551 // ============================================================================
1552
1553 #ifdef HAVE_ICONV
1554
1555 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1556 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1557 //     (unless there's yet another bug in glibc) the only case when iconv()
1558 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1559 //     left in the input buffer -- when _real_ error occurs,
1560 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1561 //     iconv() failure.
1562 //     [This bug does not appear in glibc 2.2.]
1563 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1564 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1565                                      (errno != E2BIG || bufLeft != 0))
1566 #else
1567 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1568 #endif
1569
1570 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1571
1572 #define ICONV_T_INVALID ((iconv_t)-1)
1573
1574 #if SIZEOF_WCHAR_T == 4
1575     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1576     #define WC_ENC      wxFONTENCODING_UTF32
1577 #elif SIZEOF_WCHAR_T == 2
1578     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1579     #define WC_ENC      wxFONTENCODING_UTF16
1580 #else // sizeof(wchar_t) != 2 nor 4
1581     // does this ever happen?
1582     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1583 #endif
1584
1585 // ----------------------------------------------------------------------------
1586 // wxMBConv_iconv: encapsulates an iconv character set
1587 // ----------------------------------------------------------------------------
1588
1589 class wxMBConv_iconv : public wxMBConv
1590 {
1591 public:
1592     wxMBConv_iconv(const wxChar *name);
1593     virtual ~wxMBConv_iconv();
1594
1595     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1596     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1597
1598     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1599     virtual size_t GetMBNulLen() const;
1600
1601     virtual wxMBConv *Clone() const
1602     {
1603         wxMBConv_iconv *p = new wxMBConv_iconv(m_name);
1604         p->m_minMBCharWidth = m_minMBCharWidth;
1605         return p;
1606     }
1607
1608     bool IsOk() const
1609         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1610
1611 protected:
1612     // the iconv handlers used to translate from multibyte
1613     // to wide char and in the other direction
1614     iconv_t m2w,
1615             w2m;
1616
1617 #if wxUSE_THREADS
1618     // guards access to m2w and w2m objects
1619     wxMutex m_iconvMutex;
1620 #endif
1621
1622 private:
1623     // the name (for iconv_open()) of a wide char charset -- if none is
1624     // available on this machine, it will remain NULL
1625     static wxString ms_wcCharsetName;
1626
1627     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1628     // different endian-ness than the native one
1629     static bool ms_wcNeedsSwap;
1630
1631
1632     // name of the encoding handled by this conversion
1633     wxString m_name;
1634
1635     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1636     // initially
1637     size_t m_minMBCharWidth;
1638 };
1639
1640 // make the constructor available for unit testing
1641 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const wxChar* name )
1642 {
1643     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1644     if ( !result->IsOk() )
1645     {
1646         delete result;
1647         return 0;
1648     }
1649
1650     return result;
1651 }
1652
1653 wxString wxMBConv_iconv::ms_wcCharsetName;
1654 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1655
1656 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1657               : m_name(name)
1658 {
1659     m_minMBCharWidth = 0;
1660
1661     // iconv operates with chars, not wxChars, but luckily it uses only ASCII
1662     // names for the charsets
1663     const wxCharBuffer cname(wxString(name).ToAscii());
1664
1665     // check for charset that represents wchar_t:
1666     if ( ms_wcCharsetName.empty() )
1667     {
1668         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1669
1670 #if wxUSE_FONTMAP
1671         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1672 #else // !wxUSE_FONTMAP
1673         static const wxChar *names_static[] =
1674         {
1675 #if SIZEOF_WCHAR_T == 4
1676             _T("UCS-4"),
1677 #elif SIZEOF_WCHAR_T = 2
1678             _T("UCS-2"),
1679 #endif
1680             NULL
1681         };
1682         const wxChar **names = names_static;
1683 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1684
1685         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1686         {
1687             const wxString nameCS(*names);
1688
1689             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1690             wxString nameXE(nameCS);
1691
1692 #ifdef WORDS_BIGENDIAN
1693                 nameXE += _T("BE");
1694 #else // little endian
1695                 nameXE += _T("LE");
1696 #endif
1697
1698             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1699                        nameXE.c_str());
1700
1701             m2w = iconv_open(nameXE.ToAscii(), cname);
1702             if ( m2w == ICONV_T_INVALID )
1703             {
1704                 // try charset w/o bytesex info (e.g. "UCS4")
1705                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1706                            nameCS.c_str());
1707                 m2w = iconv_open(nameCS.ToAscii(), cname);
1708
1709                 // and check for bytesex ourselves:
1710                 if ( m2w != ICONV_T_INVALID )
1711                 {
1712                     char    buf[2], *bufPtr;
1713                     wchar_t wbuf[2], *wbufPtr;
1714                     size_t  insz, outsz;
1715                     size_t  res;
1716
1717                     buf[0] = 'A';
1718                     buf[1] = 0;
1719                     wbuf[0] = 0;
1720                     insz = 2;
1721                     outsz = SIZEOF_WCHAR_T * 2;
1722                     wbufPtr = wbuf;
1723                     bufPtr = buf;
1724
1725                     res = iconv(
1726                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1727                         (char**)&wbufPtr, &outsz);
1728
1729                     if (ICONV_FAILED(res, insz))
1730                     {
1731                         wxLogLastError(wxT("iconv"));
1732                         wxLogError(_("Conversion to charset '%s' doesn't work."),
1733                                    nameCS.c_str());
1734                     }
1735                     else // ok, can convert to this encoding, remember it
1736                     {
1737                         ms_wcCharsetName = nameCS;
1738                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1739                     }
1740                 }
1741             }
1742             else // use charset not requiring byte swapping
1743             {
1744                 ms_wcCharsetName = nameXE;
1745             }
1746         }
1747
1748         wxLogTrace(TRACE_STRCONV,
1749                    wxT("iconv wchar_t charset is \"%s\"%s"),
1750                    ms_wcCharsetName.empty() ? _T("<none>")
1751                                             : ms_wcCharsetName.c_str(),
1752                    ms_wcNeedsSwap ? _T(" (needs swap)")
1753                                   : _T(""));
1754     }
1755     else // we already have ms_wcCharsetName
1756     {
1757         m2w = iconv_open(ms_wcCharsetName.ToAscii(), cname);
1758     }
1759
1760     if ( ms_wcCharsetName.empty() )
1761     {
1762         w2m = ICONV_T_INVALID;
1763     }
1764     else
1765     {
1766         w2m = iconv_open(cname, ms_wcCharsetName.ToAscii());
1767         if ( w2m == ICONV_T_INVALID )
1768         {
1769             wxLogTrace(TRACE_STRCONV,
1770                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
1771                        ms_wcCharsetName.c_str(), cname.data());
1772         }
1773     }
1774 }
1775
1776 wxMBConv_iconv::~wxMBConv_iconv()
1777 {
1778     if ( m2w != ICONV_T_INVALID )
1779         iconv_close(m2w);
1780     if ( w2m != ICONV_T_INVALID )
1781         iconv_close(w2m);
1782 }
1783
1784 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1785 {
1786     // find the string length: notice that must be done differently for
1787     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
1788     size_t inbuf;
1789     const size_t nulLen = GetMBNulLen();
1790     switch ( nulLen )
1791     {
1792         default:
1793             return wxCONV_FAILED;
1794
1795         case 1:
1796             inbuf = strlen(psz); // arguably more optimized than our version
1797             break;
1798
1799         case 2:
1800         case 4:
1801             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
1802             // they also have to start at character boundary and not span two
1803             // adjacent characters
1804             const char *p;
1805             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
1806                 ;
1807             inbuf = p - psz;
1808             break;
1809     }
1810
1811 #if wxUSE_THREADS
1812     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
1813     //     Unfortunately there are a couple of global wxCSConv objects such as
1814     //     wxConvLocal that are used all over wx code, so we have to make sure
1815     //     the handle is used by at most one thread at the time. Otherwise
1816     //     only a few wx classes would be safe to use from non-main threads
1817     //     as MB<->WC conversion would fail "randomly".
1818     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1819 #endif // wxUSE_THREADS
1820
1821     size_t outbuf = n * SIZEOF_WCHAR_T;
1822     size_t res, cres;
1823     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1824     wchar_t *bufPtr = buf;
1825     const char *pszPtr = psz;
1826
1827     if (buf)
1828     {
1829         // have destination buffer, convert there
1830         cres = iconv(m2w,
1831                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1832                      (char**)&bufPtr, &outbuf);
1833         res = n - (outbuf / SIZEOF_WCHAR_T);
1834
1835         if (ms_wcNeedsSwap)
1836         {
1837             // convert to native endianness
1838             for ( unsigned i = 0; i < res; i++ )
1839                 buf[n] = WC_BSWAP(buf[i]);
1840         }
1841
1842         // NUL-terminate the string if there is any space left
1843         if (res < n)
1844             buf[res] = 0;
1845     }
1846     else
1847     {
1848         // no destination buffer... convert using temp buffer
1849         // to calculate destination buffer requirement
1850         wchar_t tbuf[8];
1851         res = 0;
1852
1853         do
1854         {
1855             bufPtr = tbuf;
1856             outbuf = 8 * SIZEOF_WCHAR_T;
1857
1858             cres = iconv(m2w,
1859                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1860                          (char**)&bufPtr, &outbuf );
1861
1862             res += 8 - (outbuf / SIZEOF_WCHAR_T);
1863         }
1864         while ((cres == (size_t)-1) && (errno == E2BIG));
1865     }
1866
1867     if (ICONV_FAILED(cres, inbuf))
1868     {
1869         //VS: it is ok if iconv fails, hence trace only
1870         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1871         return wxCONV_FAILED;
1872     }
1873
1874     return res;
1875 }
1876
1877 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1878 {
1879 #if wxUSE_THREADS
1880     // NB: explained in MB2WC
1881     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1882 #endif
1883
1884     size_t inlen = wxWcslen(psz);
1885     size_t inbuf = inlen * SIZEOF_WCHAR_T;
1886     size_t outbuf = n;
1887     size_t res, cres;
1888
1889     wchar_t *tmpbuf = 0;
1890
1891     if (ms_wcNeedsSwap)
1892     {
1893         // need to copy to temp buffer to switch endianness
1894         // (doing WC_BSWAP twice on the original buffer won't help, as it
1895         //  could be in read-only memory, or be accessed in some other thread)
1896         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
1897         for ( size_t i = 0; i < inlen; i++ )
1898             tmpbuf[n] = WC_BSWAP(psz[i]);
1899
1900         tmpbuf[inlen] = L'\0';
1901         psz = tmpbuf;
1902     }
1903
1904     if (buf)
1905     {
1906         // have destination buffer, convert there
1907         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1908
1909         res = n - outbuf;
1910
1911         // NB: iconv was given only wcslen(psz) characters on input, and so
1912         //     it couldn't convert the trailing zero. Let's do it ourselves
1913         //     if there's some room left for it in the output buffer.
1914         if (res < n)
1915             buf[0] = 0;
1916     }
1917     else
1918     {
1919         // no destination buffer: convert using temp buffer
1920         // to calculate destination buffer requirement
1921         char tbuf[16];
1922         res = 0;
1923         do
1924         {
1925             buf = tbuf;
1926             outbuf = 16;
1927
1928             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1929
1930             res += 16 - outbuf;
1931         }
1932         while ((cres == (size_t)-1) && (errno == E2BIG));
1933     }
1934
1935     if (ms_wcNeedsSwap)
1936     {
1937         free(tmpbuf);
1938     }
1939
1940     if (ICONV_FAILED(cres, inbuf))
1941     {
1942         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1943         return wxCONV_FAILED;
1944     }
1945
1946     return res;
1947 }
1948
1949 size_t wxMBConv_iconv::GetMBNulLen() const
1950 {
1951     if ( m_minMBCharWidth == 0 )
1952     {
1953         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
1954
1955 #if wxUSE_THREADS
1956         // NB: explained in MB2WC
1957         wxMutexLocker lock(self->m_iconvMutex);
1958 #endif
1959
1960         wchar_t *wnul = L"";
1961         char buf[8]; // should be enough for NUL in any encoding
1962         size_t inLen = sizeof(wchar_t),
1963                outLen = WXSIZEOF(buf);
1964         char *inBuff = (char *)wnul;
1965         char *outBuff = buf;
1966         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
1967         {
1968             self->m_minMBCharWidth = (size_t)-1;
1969         }
1970         else // ok
1971         {
1972             self->m_minMBCharWidth = outBuff - buf;
1973         }
1974     }
1975
1976     return m_minMBCharWidth;
1977 }
1978
1979 #endif // HAVE_ICONV
1980
1981
1982 // ============================================================================
1983 // Win32 conversion classes
1984 // ============================================================================
1985
1986 #ifdef wxHAVE_WIN32_MB2WC
1987
1988 // from utils.cpp
1989 #if wxUSE_FONTMAP
1990 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1991 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1992 #endif
1993
1994 class wxMBConv_win32 : public wxMBConv
1995 {
1996 public:
1997     wxMBConv_win32()
1998     {
1999         m_CodePage = CP_ACP;
2000         m_minMBCharWidth = 0;
2001     }
2002
2003     wxMBConv_win32(const wxMBConv_win32& conv)
2004         : wxMBConv()
2005     {
2006         m_CodePage = conv.m_CodePage;
2007         m_minMBCharWidth = conv.m_minMBCharWidth;
2008     }
2009
2010 #if wxUSE_FONTMAP
2011     wxMBConv_win32(const wxChar* name)
2012     {
2013         m_CodePage = wxCharsetToCodepage(name);
2014         m_minMBCharWidth = 0;
2015     }
2016
2017     wxMBConv_win32(wxFontEncoding encoding)
2018     {
2019         m_CodePage = wxEncodingToCodepage(encoding);
2020         m_minMBCharWidth = 0;
2021     }
2022 #endif // wxUSE_FONTMAP
2023
2024     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2025     {
2026         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2027         // the behaviour is not compatible with the Unix version (using iconv)
2028         // and break the library itself, e.g. wxTextInputStream::NextChar()
2029         // wouldn't work if reading an incomplete MB char didn't result in an
2030         // error
2031         //
2032         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2033         // Win XP or newer and it is not supported for UTF-[78] so we always
2034         // use our own conversions in this case. See
2035         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2036         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2037         if ( m_CodePage == CP_UTF8 )
2038         {
2039             return wxConvUTF8.MB2WC(buf, psz, n);
2040         }
2041
2042         if ( m_CodePage == CP_UTF7 )
2043         {
2044             return wxConvUTF7.MB2WC(buf, psz, n);
2045         }
2046
2047         int flags = 0;
2048         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2049                 IsAtLeastWin2kSP4() )
2050         {
2051             flags = MB_ERR_INVALID_CHARS;
2052         }
2053
2054         const size_t len = ::MultiByteToWideChar
2055                              (
2056                                 m_CodePage,     // code page
2057                                 flags,          // flags: fall on error
2058                                 psz,            // input string
2059                                 -1,             // its length (NUL-terminated)
2060                                 buf,            // output string
2061                                 buf ? n : 0     // size of output buffer
2062                              );
2063         if ( !len )
2064         {
2065             // function totally failed
2066             return wxCONV_FAILED;
2067         }
2068
2069         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2070         // check if we succeeded, by doing a double trip:
2071         if ( !flags && buf )
2072         {
2073             const size_t mbLen = strlen(psz);
2074             wxCharBuffer mbBuf(mbLen);
2075             if ( ::WideCharToMultiByte
2076                    (
2077                       m_CodePage,
2078                       0,
2079                       buf,
2080                       -1,
2081                       mbBuf.data(),
2082                       mbLen + 1,        // size in bytes, not length
2083                       NULL,
2084                       NULL
2085                    ) == 0 ||
2086                   strcmp(mbBuf, psz) != 0 )
2087             {
2088                 // we didn't obtain the same thing we started from, hence
2089                 // the conversion was lossy and we consider that it failed
2090                 return wxCONV_FAILED;
2091             }
2092         }
2093
2094         // note that it returns count of written chars for buf != NULL and size
2095         // of the needed buffer for buf == NULL so in either case the length of
2096         // the string (which never includes the terminating NUL) is one less
2097         return len - 1;
2098     }
2099
2100     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2101     {
2102         /*
2103             we have a problem here: by default, WideCharToMultiByte() may
2104             replace characters unrepresentable in the target code page with bad
2105             quality approximations such as turning "1/2" symbol (U+00BD) into
2106             "1" for the code pages which don't have it and we, obviously, want
2107             to avoid this at any price
2108
2109             the trouble is that this function does it _silently_, i.e. it won't
2110             even tell us whether it did or not... Win98/2000 and higher provide
2111             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2112             we have to resort to a round trip, i.e. check that converting back
2113             results in the same string -- this is, of course, expensive but
2114             otherwise we simply can't be sure to not garble the data.
2115          */
2116
2117         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2118         // it doesn't work with CJK encodings (which we test for rather roughly
2119         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2120         // supporting it
2121         BOOL usedDef wxDUMMY_INITIALIZE(false);
2122         BOOL *pUsedDef;
2123         int flags;
2124         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2125         {
2126             // it's our lucky day
2127             flags = WC_NO_BEST_FIT_CHARS;
2128             pUsedDef = &usedDef;
2129         }
2130         else // old system or unsupported encoding
2131         {
2132             flags = 0;
2133             pUsedDef = NULL;
2134         }
2135
2136         const size_t len = ::WideCharToMultiByte
2137                              (
2138                                 m_CodePage,     // code page
2139                                 flags,          // either none or no best fit
2140                                 pwz,            // input string
2141                                 -1,             // it is (wide) NUL-terminated
2142                                 buf,            // output buffer
2143                                 buf ? n : 0,    // and its size
2144                                 NULL,           // default "replacement" char
2145                                 pUsedDef        // [out] was it used?
2146                              );
2147
2148         if ( !len )
2149         {
2150             // function totally failed
2151             return wxCONV_FAILED;
2152         }
2153
2154         // if we were really converting, check if we succeeded
2155         if ( buf )
2156         {
2157             if ( flags )
2158             {
2159                 // check if the conversion failed, i.e. if any replacements
2160                 // were done
2161                 if ( usedDef )
2162                     return wxCONV_FAILED;
2163             }
2164             else // we must resort to double tripping...
2165             {
2166                 wxWCharBuffer wcBuf(n);
2167                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2168                         wcscmp(wcBuf, pwz) != 0 )
2169                 {
2170                     // we didn't obtain the same thing we started from, hence
2171                     // the conversion was lossy and we consider that it failed
2172                     return wxCONV_FAILED;
2173                 }
2174             }
2175         }
2176
2177         // see the comment above for the reason of "len - 1"
2178         return len - 1;
2179     }
2180
2181     virtual size_t GetMBNulLen() const
2182     {
2183         if ( m_minMBCharWidth == 0 )
2184         {
2185             int len = ::WideCharToMultiByte
2186                         (
2187                             m_CodePage,     // code page
2188                             0,              // no flags
2189                             L"",            // input string
2190                             1,              // translate just the NUL
2191                             NULL,           // output buffer
2192                             0,              // and its size
2193                             NULL,           // no replacement char
2194                             NULL            // [out] don't care if it was used
2195                         );
2196
2197             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2198             switch ( len )
2199             {
2200                 default:
2201                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2202                     self->m_minMBCharWidth = (size_t)-1;
2203                     break;
2204
2205                 case 0:
2206                     self->m_minMBCharWidth = (size_t)-1;
2207                     break;
2208
2209                 case 1:
2210                 case 2:
2211                 case 4:
2212                     self->m_minMBCharWidth = len;
2213                     break;
2214             }
2215         }
2216
2217         return m_minMBCharWidth;
2218     }
2219
2220     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2221
2222     bool IsOk() const { return m_CodePage != -1; }
2223
2224 private:
2225     static bool CanUseNoBestFit()
2226     {
2227         static int s_isWin98Or2k = -1;
2228
2229         if ( s_isWin98Or2k == -1 )
2230         {
2231             int verMaj, verMin;
2232             switch ( wxGetOsVersion(&verMaj, &verMin) )
2233             {
2234                 case wxOS_WINDOWS_9X:
2235                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2236                     break;
2237
2238                 case wxOS_WINDOWS_NT:
2239                     s_isWin98Or2k = verMaj >= 5;
2240                     break;
2241
2242                 default:
2243                     // unknown: be conservative by default
2244                     s_isWin98Or2k = 0;
2245                     break;
2246             }
2247
2248             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2249         }
2250
2251         return s_isWin98Or2k == 1;
2252     }
2253
2254     static bool IsAtLeastWin2kSP4()
2255     {
2256 #ifdef __WXWINCE__
2257         return false;
2258 #else
2259         static int s_isAtLeastWin2kSP4 = -1;
2260
2261         if ( s_isAtLeastWin2kSP4 == -1 )
2262         {
2263             OSVERSIONINFOEX ver;
2264
2265             memset(&ver, 0, sizeof(ver));
2266             ver.dwOSVersionInfoSize = sizeof(ver);
2267             GetVersionEx((OSVERSIONINFO*)&ver);
2268
2269             s_isAtLeastWin2kSP4 =
2270               ((ver.dwMajorVersion > 5) || // Vista+
2271                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2272                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2273                ver.wServicePackMajor >= 4)) // 2000 SP4+
2274               ? 1 : 0;
2275         }
2276
2277         return s_isAtLeastWin2kSP4 == 1;
2278 #endif
2279     }
2280
2281
2282     // the code page we're working with
2283     long m_CodePage;
2284
2285     // cached result of GetMBNulLen(), set to 0 initially meaning
2286     // "unknown"
2287     size_t m_minMBCharWidth;
2288 };
2289
2290 #endif // wxHAVE_WIN32_MB2WC
2291
2292 // ============================================================================
2293 // Cocoa conversion classes
2294 // ============================================================================
2295
2296 #if defined(__WXCOCOA__)
2297
// RN: There is no UTF-32 support in either Core Foundation or Cocoa.
// Strangely enough, Core Foundation uses UTF-32 internally quite a bit -
// it's just not public (yet).
2301
2302 #include <CoreFoundation/CFString.h>
2303 #include <CoreFoundation/CFStringEncodingExt.h>
2304
2305 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
2306 {
2307     CFStringEncoding enc = kCFStringEncodingInvalidId ;
2308
2309     switch (encoding)
2310     {
2311         case wxFONTENCODING_DEFAULT :
2312             enc = CFStringGetSystemEncoding();
2313             break ;
2314
2315         case wxFONTENCODING_ISO8859_1 :
2316             enc = kCFStringEncodingISOLatin1 ;
2317             break ;
2318         case wxFONTENCODING_ISO8859_2 :
2319             enc = kCFStringEncodingISOLatin2;
2320             break ;
2321         case wxFONTENCODING_ISO8859_3 :
2322             enc = kCFStringEncodingISOLatin3 ;
2323             break ;
2324         case wxFONTENCODING_ISO8859_4 :
2325             enc = kCFStringEncodingISOLatin4;
2326             break ;
2327         case wxFONTENCODING_ISO8859_5 :
2328             enc = kCFStringEncodingISOLatinCyrillic;
2329             break ;
2330         case wxFONTENCODING_ISO8859_6 :
2331             enc = kCFStringEncodingISOLatinArabic;
2332             break ;
2333         case wxFONTENCODING_ISO8859_7 :
2334             enc = kCFStringEncodingISOLatinGreek;
2335             break ;
2336         case wxFONTENCODING_ISO8859_8 :
2337             enc = kCFStringEncodingISOLatinHebrew;
2338             break ;
2339         case wxFONTENCODING_ISO8859_9 :
2340             enc = kCFStringEncodingISOLatin5;
2341             break ;
2342         case wxFONTENCODING_ISO8859_10 :
2343             enc = kCFStringEncodingISOLatin6;
2344             break ;
2345         case wxFONTENCODING_ISO8859_11 :
2346             enc = kCFStringEncodingISOLatinThai;
2347             break ;
2348         case wxFONTENCODING_ISO8859_13 :
2349             enc = kCFStringEncodingISOLatin7;
2350             break ;
2351         case wxFONTENCODING_ISO8859_14 :
2352             enc = kCFStringEncodingISOLatin8;
2353             break ;
2354         case wxFONTENCODING_ISO8859_15 :
2355             enc = kCFStringEncodingISOLatin9;
2356             break ;
2357
2358         case wxFONTENCODING_KOI8 :
2359             enc = kCFStringEncodingKOI8_R;
2360             break ;
2361         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
2362             enc = kCFStringEncodingDOSRussian;
2363             break ;
2364
2365 //      case wxFONTENCODING_BULGARIAN :
2366 //          enc = ;
2367 //          break ;
2368
2369         case wxFONTENCODING_CP437 :
2370             enc = kCFStringEncodingDOSLatinUS ;
2371             break ;
2372         case wxFONTENCODING_CP850 :
2373             enc = kCFStringEncodingDOSLatin1;
2374             break ;
2375         case wxFONTENCODING_CP852 :
2376             enc = kCFStringEncodingDOSLatin2;
2377             break ;
2378         case wxFONTENCODING_CP855 :
2379             enc = kCFStringEncodingDOSCyrillic;
2380             break ;
2381         case wxFONTENCODING_CP866 :
2382             enc = kCFStringEncodingDOSRussian ;
2383             break ;
2384         case wxFONTENCODING_CP874 :
2385             enc = kCFStringEncodingDOSThai;
2386             break ;
2387         case wxFONTENCODING_CP932 :
2388             enc = kCFStringEncodingDOSJapanese;
2389             break ;
2390         case wxFONTENCODING_CP936 :
2391             enc = kCFStringEncodingDOSChineseSimplif ;
2392             break ;
2393         case wxFONTENCODING_CP949 :
2394             enc = kCFStringEncodingDOSKorean;
2395             break ;
2396         case wxFONTENCODING_CP950 :
2397             enc = kCFStringEncodingDOSChineseTrad;
2398             break ;
2399         case wxFONTENCODING_CP1250 :
2400             enc = kCFStringEncodingWindowsLatin2;
2401             break ;
2402         case wxFONTENCODING_CP1251 :
2403             enc = kCFStringEncodingWindowsCyrillic ;
2404             break ;
2405         case wxFONTENCODING_CP1252 :
2406             enc = kCFStringEncodingWindowsLatin1 ;
2407             break ;
2408         case wxFONTENCODING_CP1253 :
2409             enc = kCFStringEncodingWindowsGreek;
2410             break ;
2411         case wxFONTENCODING_CP1254 :
2412             enc = kCFStringEncodingWindowsLatin5;
2413             break ;
2414         case wxFONTENCODING_CP1255 :
2415             enc = kCFStringEncodingWindowsHebrew ;
2416             break ;
2417         case wxFONTENCODING_CP1256 :
2418             enc = kCFStringEncodingWindowsArabic ;
2419             break ;
2420         case wxFONTENCODING_CP1257 :
2421             enc = kCFStringEncodingWindowsBalticRim;
2422             break ;
2423 //   This only really encodes to UTF7 (if that) evidently
2424 //        case wxFONTENCODING_UTF7 :
2425 //            enc = kCFStringEncodingNonLossyASCII ;
2426 //            break ;
2427         case wxFONTENCODING_UTF8 :
2428             enc = kCFStringEncodingUTF8 ;
2429             break ;
2430         case wxFONTENCODING_EUC_JP :
2431             enc = kCFStringEncodingEUC_JP;
2432             break ;
2433         case wxFONTENCODING_UTF16 :
2434             enc = kCFStringEncodingUnicode ;
2435             break ;
2436         case wxFONTENCODING_MACROMAN :
2437             enc = kCFStringEncodingMacRoman ;
2438             break ;
2439         case wxFONTENCODING_MACJAPANESE :
2440             enc = kCFStringEncodingMacJapanese ;
2441             break ;
2442         case wxFONTENCODING_MACCHINESETRAD :
2443             enc = kCFStringEncodingMacChineseTrad ;
2444             break ;
2445         case wxFONTENCODING_MACKOREAN :
2446             enc = kCFStringEncodingMacKorean ;
2447             break ;
2448         case wxFONTENCODING_MACARABIC :
2449             enc = kCFStringEncodingMacArabic ;
2450             break ;
2451         case wxFONTENCODING_MACHEBREW :
2452             enc = kCFStringEncodingMacHebrew ;
2453             break ;
2454         case wxFONTENCODING_MACGREEK :
2455             enc = kCFStringEncodingMacGreek ;
2456             break ;
2457         case wxFONTENCODING_MACCYRILLIC :
2458             enc = kCFStringEncodingMacCyrillic ;
2459             break ;
2460         case wxFONTENCODING_MACDEVANAGARI :
2461             enc = kCFStringEncodingMacDevanagari ;
2462             break ;
2463         case wxFONTENCODING_MACGURMUKHI :
2464             enc = kCFStringEncodingMacGurmukhi ;
2465             break ;
2466         case wxFONTENCODING_MACGUJARATI :
2467             enc = kCFStringEncodingMacGujarati ;
2468             break ;
2469         case wxFONTENCODING_MACORIYA :
2470             enc = kCFStringEncodingMacOriya ;
2471             break ;
2472         case wxFONTENCODING_MACBENGALI :
2473             enc = kCFStringEncodingMacBengali ;
2474             break ;
2475         case wxFONTENCODING_MACTAMIL :
2476             enc = kCFStringEncodingMacTamil ;
2477             break ;
2478         case wxFONTENCODING_MACTELUGU :
2479             enc = kCFStringEncodingMacTelugu ;
2480             break ;
2481         case wxFONTENCODING_MACKANNADA :
2482             enc = kCFStringEncodingMacKannada ;
2483             break ;
2484         case wxFONTENCODING_MACMALAJALAM :
2485             enc = kCFStringEncodingMacMalayalam ;
2486             break ;
2487         case wxFONTENCODING_MACSINHALESE :
2488             enc = kCFStringEncodingMacSinhalese ;
2489             break ;
2490         case wxFONTENCODING_MACBURMESE :
2491             enc = kCFStringEncodingMacBurmese ;
2492             break ;
2493         case wxFONTENCODING_MACKHMER :
2494             enc = kCFStringEncodingMacKhmer ;
2495             break ;
2496         case wxFONTENCODING_MACTHAI :
2497             enc = kCFStringEncodingMacThai ;
2498             break ;
2499         case wxFONTENCODING_MACLAOTIAN :
2500             enc = kCFStringEncodingMacLaotian ;
2501             break ;
2502         case wxFONTENCODING_MACGEORGIAN :
2503             enc = kCFStringEncodingMacGeorgian ;
2504             break ;
2505         case wxFONTENCODING_MACARMENIAN :
2506             enc = kCFStringEncodingMacArmenian ;
2507             break ;
2508         case wxFONTENCODING_MACCHINESESIMP :
2509             enc = kCFStringEncodingMacChineseSimp ;
2510             break ;
2511         case wxFONTENCODING_MACTIBETAN :
2512             enc = kCFStringEncodingMacTibetan ;
2513             break ;
2514         case wxFONTENCODING_MACMONGOLIAN :
2515             enc = kCFStringEncodingMacMongolian ;
2516             break ;
2517         case wxFONTENCODING_MACETHIOPIC :
2518             enc = kCFStringEncodingMacEthiopic ;
2519             break ;
2520         case wxFONTENCODING_MACCENTRALEUR :
2521             enc = kCFStringEncodingMacCentralEurRoman ;
2522             break ;
2523         case wxFONTENCODING_MACVIATNAMESE :
2524             enc = kCFStringEncodingMacVietnamese ;
2525             break ;
2526         case wxFONTENCODING_MACARABICEXT :
2527             enc = kCFStringEncodingMacExtArabic ;
2528             break ;
2529         case wxFONTENCODING_MACSYMBOL :
2530             enc = kCFStringEncodingMacSymbol ;
2531             break ;
2532         case wxFONTENCODING_MACDINGBATS :
2533             enc = kCFStringEncodingMacDingbats ;
2534             break ;
2535         case wxFONTENCODING_MACTURKISH :
2536             enc = kCFStringEncodingMacTurkish ;
2537             break ;
2538         case wxFONTENCODING_MACCROATIAN :
2539             enc = kCFStringEncodingMacCroatian ;
2540             break ;
2541         case wxFONTENCODING_MACICELANDIC :
2542             enc = kCFStringEncodingMacIcelandic ;
2543             break ;
2544         case wxFONTENCODING_MACROMANIAN :
2545             enc = kCFStringEncodingMacRomanian ;
2546             break ;
2547         case wxFONTENCODING_MACCELTIC :
2548             enc = kCFStringEncodingMacCeltic ;
2549             break ;
2550         case wxFONTENCODING_MACGAELIC :
2551             enc = kCFStringEncodingMacGaelic ;
2552             break ;
2553 //      case wxFONTENCODING_MACKEYBOARD :
2554 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
2555 //          break ;
2556
2557         default :
2558             // because gcc is picky
2559             break ;
2560     }
2561
2562     return enc ;
2563 }
2564
2565 class wxMBConv_cocoa : public wxMBConv
2566 {
2567 public:
2568     wxMBConv_cocoa()
2569     {
2570         Init(CFStringGetSystemEncoding()) ;
2571     }
2572
2573     wxMBConv_cocoa(const wxMBConv_cocoa& conv)
2574     {
2575         m_encoding = conv.m_encoding;
2576     }
2577
2578 #if wxUSE_FONTMAP
2579     wxMBConv_cocoa(const wxChar* name)
2580     {
2581         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2582     }
2583 #endif
2584
2585     wxMBConv_cocoa(wxFontEncoding encoding)
2586     {
2587         Init( wxCFStringEncFromFontEnc(encoding) );
2588     }
2589
2590     virtual ~wxMBConv_cocoa()
2591     {
2592     }
2593
2594     void Init( CFStringEncoding encoding)
2595     {
2596         m_encoding = encoding ;
2597     }
2598
2599     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2600     {
2601         wxASSERT(szUnConv);
2602
2603         CFStringRef theString = CFStringCreateWithBytes (
2604                                                 NULL, //the allocator
2605                                                 (const UInt8*)szUnConv,
2606                                                 strlen(szUnConv),
2607                                                 m_encoding,
2608                                                 false //no BOM/external representation
2609                                                 );
2610
2611         wxASSERT(theString);
2612
2613         size_t nOutLength = CFStringGetLength(theString);
2614
2615         if (szOut == NULL)
2616         {
2617             CFRelease(theString);
2618             return nOutLength;
2619         }
2620
2621         CFRange theRange = { 0, nOutSize };
2622
2623 #if SIZEOF_WCHAR_T == 4
2624         UniChar* szUniCharBuffer = new UniChar[nOutSize];
2625 #endif
2626
2627         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2628
2629         CFRelease(theString);
2630
2631         szUniCharBuffer[nOutLength] = '\0';
2632
2633 #if SIZEOF_WCHAR_T == 4
2634         wxMBConvUTF16 converter;
2635         converter.MB2WC( szOut, (const char*)szUniCharBuffer, nOutSize );
2636         delete [] szUniCharBuffer;
2637 #endif
2638
2639         return nOutLength;
2640     }
2641
2642     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2643     {
2644         wxASSERT(szUnConv);
2645
2646         size_t nRealOutSize;
2647         size_t nBufSize = wxWcslen(szUnConv);
2648         UniChar* szUniBuffer = (UniChar*) szUnConv;
2649
2650 #if SIZEOF_WCHAR_T == 4
2651         wxMBConvUTF16 converter ;
2652         nBufSize = converter.WC2MB( NULL, szUnConv, 0 );
2653         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1];
2654         converter.WC2MB( (char*) szUniBuffer, szUnConv, nBufSize + sizeof(UniChar));
2655         nBufSize /= sizeof(UniChar);
2656 #endif
2657
2658         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2659                                 NULL, //allocator
2660                                 szUniBuffer,
2661                                 nBufSize,
2662                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2663                             );
2664
2665         wxASSERT(theString);
2666
2667         //Note that CER puts a BOM when converting to unicode
2668         //so we  check and use getchars instead in that case
2669         if (m_encoding == kCFStringEncodingUnicode)
2670         {
2671             if (szOut != NULL)
2672                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2673
2674             nRealOutSize = CFStringGetLength(theString) + 1;
2675         }
2676         else
2677         {
2678             CFStringGetBytes(
2679                 theString,
2680                 CFRangeMake(0, CFStringGetLength(theString)),
2681                 m_encoding,
2682                 0, //what to put in characters that can't be converted -
2683                     //0 tells CFString to return NULL if it meets such a character
2684                 false, //not an external representation
2685                 (UInt8*) szOut,
2686                 nOutSize,
2687                 (CFIndex*) &nRealOutSize
2688                         );
2689         }
2690
2691         CFRelease(theString);
2692
2693 #if SIZEOF_WCHAR_T == 4
2694         delete[] szUniBuffer;
2695 #endif
2696
2697         return  nRealOutSize - 1;
2698     }
2699
2700     virtual wxMBConv *Clone() const { return new wxMBConv_cocoa(*this); }
2701
2702     bool IsOk() const
2703     {
2704         return m_encoding != kCFStringEncodingInvalidId &&
2705               CFStringIsEncodingAvailable(m_encoding);
2706     }
2707
2708 private:
2709     CFStringEncoding m_encoding ;
2710 };
2711
2712 #endif // defined(__WXCOCOA__)
2713
2714 // ============================================================================
2715 // Mac conversion classes
2716 // ============================================================================
2717
2718 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2719
2720 class wxMBConv_mac : public wxMBConv
2721 {
2722 public:
2723     wxMBConv_mac()
2724     {
2725         Init(CFStringGetSystemEncoding()) ;
2726     }
2727
2728     wxMBConv_mac(const wxMBConv_mac& conv)
2729     {
2730         Init(conv.m_char_encoding);
2731     }
2732
2733 #if wxUSE_FONTMAP
2734     wxMBConv_mac(const wxChar* name)
2735     {
2736         Init( wxMacGetSystemEncFromFontEnc( wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) );
2737     }
2738 #endif
2739
2740     wxMBConv_mac(wxFontEncoding encoding)
2741     {
2742         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2743     }
2744
2745     virtual ~wxMBConv_mac()
2746     {
2747         OSStatus status = noErr ;
2748         if (m_MB2WC_converter)
2749             status = TECDisposeConverter(m_MB2WC_converter);
2750         if (m_WC2MB_converter)
2751             status = TECDisposeConverter(m_WC2MB_converter);
2752     }
2753
2754     void Init( TextEncodingBase encoding,TextEncodingVariant encodingVariant = kTextEncodingDefaultVariant ,
2755             TextEncodingFormat encodingFormat = kTextEncodingDefaultFormat)
2756     {
2757         m_MB2WC_converter = NULL ;
2758         m_WC2MB_converter = NULL ;
2759         m_char_encoding = CreateTextEncoding(encoding, encodingVariant, encodingFormat) ;
2760         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault, 0, kUnicode16BitFormat) ;
2761     }
2762
2763     virtual void CreateIfNeeded() const
2764     {
2765         if ( m_MB2WC_converter == NULL && m_WC2MB_converter == NULL )
2766         {
2767             OSStatus status = noErr ;
2768             status = TECCreateConverter(&m_MB2WC_converter,
2769                                     m_char_encoding,
2770                                     m_unicode_encoding);
2771             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2772             status = TECCreateConverter(&m_WC2MB_converter,
2773                                     m_unicode_encoding,
2774                                     m_char_encoding);
2775             wxASSERT_MSG( status == noErr , _("Unable to create TextEncodingConverter")) ;
2776         }
2777     }
2778
2779     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2780     {
2781         CreateIfNeeded() ;
2782         OSStatus status = noErr ;
2783         ByteCount byteOutLen ;
2784         ByteCount byteInLen = strlen(psz) + 1;
2785         wchar_t *tbuf = NULL ;
2786         UniChar* ubuf = NULL ;
2787         size_t res = 0 ;
2788
2789         if (buf == NULL)
2790         {
2791             // Apple specs say at least 32
2792             n = wxMax( 32, byteInLen ) ;
2793             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
2794         }
2795
2796         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2797
2798 #if SIZEOF_WCHAR_T == 4
2799         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2800 #else
2801         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2802 #endif
2803
2804         status = TECConvertText(
2805             m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
2806             (TextPtr) ubuf, byteBufferLen, &byteOutLen);
2807
2808 #if SIZEOF_WCHAR_T == 4
2809         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2810         // is not properly terminated we get random characters at the end
2811         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2812         wxMBConvUTF16 converter ;
2813         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
2814         free( ubuf ) ;
2815 #else
2816         res = byteOutLen / sizeof( UniChar ) ;
2817 #endif
2818
2819         if ( buf == NULL )
2820              free(tbuf) ;
2821
2822         if ( buf  && res < n)
2823             buf[res] = 0;
2824
2825         return res ;
2826     }
2827
2828     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2829     {
2830         CreateIfNeeded() ;
2831         OSStatus status = noErr ;
2832         ByteCount byteOutLen ;
2833         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2834
2835         char *tbuf = NULL ;
2836
2837         if (buf == NULL)
2838         {
2839             // Apple specs say at least 32
2840             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2841             tbuf = (char*) malloc( n ) ;
2842         }
2843
2844         ByteCount byteBufferLen = n ;
2845         UniChar* ubuf = NULL ;
2846
2847 #if SIZEOF_WCHAR_T == 4
2848         wxMBConvUTF16 converter ;
2849         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2850         byteInLen = unicharlen ;
2851         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2852         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2853 #else
2854         ubuf = (UniChar*) psz ;
2855 #endif
2856
2857         status = TECConvertText(
2858             m_WC2MB_converter, (ConstTextPtr) ubuf, byteInLen, &byteInLen,
2859             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2860
2861 #if SIZEOF_WCHAR_T == 4
2862         free( ubuf ) ;
2863 #endif
2864
2865         if ( buf == NULL )
2866             free(tbuf) ;
2867
2868         size_t res = byteOutLen ;
2869         if ( buf  && res < n)
2870         {
2871             buf[res] = 0;
2872
2873             //we need to double-trip to verify it didn't insert any ? in place
2874             //of bogus characters
2875             wxWCharBuffer wcBuf(n);
2876             size_t pszlen = wxWcslen(psz);
2877             if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2878                         wxWcslen(wcBuf) != pszlen ||
2879                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2880             {
2881                 // we didn't obtain the same thing we started from, hence
2882                 // the conversion was lossy and we consider that it failed
2883                 return wxCONV_FAILED;
2884             }
2885         }
2886
2887         return res ;
2888     }
2889
2890     virtual wxMBConv *Clone() const { return new wxMBConv_mac(*this); }
2891
2892     bool IsOk() const
2893     {
2894         CreateIfNeeded() ;
2895         return m_MB2WC_converter != NULL && m_WC2MB_converter != NULL;
2896     }
2897
2898 protected :
2899     mutable TECObjectRef m_MB2WC_converter;
2900     mutable TECObjectRef m_WC2MB_converter;
2901
2902     TextEncodingBase m_char_encoding;
2903     TextEncodingBase m_unicode_encoding;
2904 };
2905
2906 // MB is decomposed (D) normalized UTF8
2907
2908 class wxMBConv_macUTF8D : public wxMBConv_mac
2909 {
2910 public :
2911     wxMBConv_macUTF8D()
2912     {
2913         Init( kTextEncodingUnicodeDefault , kUnicodeNoSubset , kUnicodeUTF8Format ) ;
2914         m_uni = NULL;
2915         m_uniBack = NULL ;
2916     }
2917
2918     virtual ~wxMBConv_macUTF8D()
2919     {
2920         if (m_uni!=NULL)
2921             DisposeUnicodeToTextInfo(&m_uni);
2922         if (m_uniBack!=NULL)
2923             DisposeUnicodeToTextInfo(&m_uniBack);
2924     }
2925
2926     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2927     {
2928         CreateIfNeeded() ;
2929         OSStatus status = noErr ;
2930         ByteCount byteOutLen ;
2931         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2932
2933         char *tbuf = NULL ;
2934
2935         if (buf == NULL)
2936         {
2937             // Apple specs say at least 32
2938             n = wxMax( 32, ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2939             tbuf = (char*) malloc( n ) ;
2940         }
2941
2942         ByteCount byteBufferLen = n ;
2943         UniChar* ubuf = NULL ;
2944
2945 #if SIZEOF_WCHAR_T == 4
2946         wxMBConvUTF16 converter ;
2947         size_t unicharlen = converter.WC2MB( NULL, psz, 0 ) ;
2948         byteInLen = unicharlen ;
2949         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2950         converter.WC2MB( (char*) ubuf, psz, unicharlen + 2 ) ;
2951 #else
2952         ubuf = (UniChar*) psz ;
2953 #endif
2954
2955         // ubuf is a non-decomposed UniChar buffer
2956
2957         ByteCount dcubuflen = byteInLen * 2 + 2 ;
2958         ByteCount dcubufread , dcubufwritten ;
2959         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
2960
2961         ConvertFromUnicodeToText( m_uni , byteInLen , ubuf ,
2962             kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , dcubuf ) ;
2963
2964         // we now convert that decomposed buffer into UTF8
2965
2966         status = TECConvertText(
2967             m_WC2MB_converter, (ConstTextPtr) dcubuf, dcubufwritten, &dcubufread,
2968             (TextPtr) (buf ? buf : tbuf), byteBufferLen, &byteOutLen);
2969
2970         free( dcubuf );
2971
2972 #if SIZEOF_WCHAR_T == 4
2973         free( ubuf ) ;
2974 #endif
2975
2976         if ( buf == NULL )
2977             free(tbuf) ;
2978
2979         size_t res = byteOutLen ;
2980         if ( buf  && res < n)
2981         {
2982             buf[res] = 0;
2983             // don't test for round-trip fidelity yet, we cannot guarantee it yet
2984         }
2985
2986         return res ;
2987     }
2988
2989     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2990     {
2991         CreateIfNeeded() ;
2992         OSStatus status = noErr ;
2993         ByteCount byteOutLen ;
2994         ByteCount byteInLen = strlen(psz) + 1;
2995         wchar_t *tbuf = NULL ;
2996         UniChar* ubuf = NULL ;
2997         size_t res = 0 ;
2998
2999         if (buf == NULL)
3000         {
3001             // Apple specs say at least 32
3002             n = wxMax( 32, byteInLen ) ;
3003             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T ) ;
3004         }
3005
3006         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
3007
3008 #if SIZEOF_WCHAR_T == 4
3009         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
3010 #else
3011         ubuf = (UniChar*) (buf ? buf : tbuf) ;
3012 #endif
3013
3014         ByteCount dcubuflen = byteBufferLen * 2 + 2 ;
3015         ByteCount dcubufread , dcubufwritten ;
3016         UniChar *dcubuf = (UniChar*) malloc( dcubuflen ) ;
3017
3018         status = TECConvertText(
3019                                 m_MB2WC_converter, (ConstTextPtr) psz, byteInLen, &byteInLen,
3020                                 (TextPtr) dcubuf, dcubuflen, &byteOutLen);
3021         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
3022         // is not properly terminated we get random characters at the end
3023         dcubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3024
3025         // now from the decomposed UniChar to properly composed uniChar
3026         ConvertFromUnicodeToText( m_uniBack , byteOutLen , dcubuf ,
3027                                   kUnicodeDefaultDirectionMask, 0, NULL, NULL, NULL, dcubuflen  , &dcubufread , &dcubufwritten , ubuf ) ;
3028
3029         free( dcubuf );
3030         byteOutLen = dcubufwritten ;
3031         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
3032
3033
3034 #if SIZEOF_WCHAR_T == 4
3035         wxMBConvUTF16 converter ;
3036         res = converter.MB2WC( (buf ? buf : tbuf), (const char*)ubuf, n ) ;
3037         free( ubuf ) ;
3038 #else
3039         res = byteOutLen / sizeof( UniChar ) ;
3040 #endif
3041
3042         if ( buf == NULL )
3043             free(tbuf) ;
3044
3045         if ( buf  && res < n)
3046             buf[res] = 0;
3047
3048         return res ;
3049     }
3050
3051     virtual void CreateIfNeeded() const
3052     {
3053         wxMBConv_mac::CreateIfNeeded() ;
3054         if ( m_uni == NULL )
3055         {
3056             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3057                 kUnicodeNoSubset, kTextEncodingDefaultFormat);
3058             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3059                 kUnicodeCanonicalDecompVariant, kTextEncodingDefaultFormat);
3060             m_map.mappingVersion = kUnicodeUseLatestMapping;
3061
3062             OSStatus err = CreateUnicodeToTextInfo(&m_map, &m_uni);
3063             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3064
3065             m_map.unicodeEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3066                                                        kUnicodeNoSubset, kTextEncodingDefaultFormat);
3067             m_map.otherEncoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
3068                                                      kUnicodeCanonicalCompVariant, kTextEncodingDefaultFormat);
3069             m_map.mappingVersion = kUnicodeUseLatestMapping;
3070             err = CreateUnicodeToTextInfo(&m_map, &m_uniBack);
3071             wxASSERT_MSG( err == noErr , _(" Couldn't create the UnicodeConverter")) ;
3072         }
3073     }
3074 protected :
3075     mutable UnicodeToTextInfo   m_uni;
3076     mutable UnicodeToTextInfo   m_uniBack;
3077     mutable UnicodeMapping      m_map;
3078 };
3079 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
3080
3081 // ============================================================================
3082 // wxEncodingConverter based conversion classes
3083 // ============================================================================
3084
3085 #if wxUSE_FONTMAP
3086
3087 class wxMBConv_wxwin : public wxMBConv
3088 {
3089 private:
3090     void Init()
3091     {
3092         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
3093                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
3094     }
3095
3096 public:
3097     // temporarily just use wxEncodingConverter stuff,
3098     // so that it works while a better implementation is built
3099     wxMBConv_wxwin(const wxChar* name)
3100     {
3101         if (name)
3102             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3103         else
3104             m_enc = wxFONTENCODING_SYSTEM;
3105
3106         Init();
3107     }
3108
3109     wxMBConv_wxwin(wxFontEncoding enc)
3110     {
3111         m_enc = enc;
3112
3113         Init();
3114     }
3115
3116     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
3117     {
3118         size_t inbuf = strlen(psz);
3119         if (buf)
3120         {
3121             if (!m2w.Convert(psz, buf))
3122                 return wxCONV_FAILED;
3123         }
3124         return inbuf;
3125     }
3126
3127     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
3128     {
3129         const size_t inbuf = wxWcslen(psz);
3130         if (buf)
3131         {
3132             if (!w2m.Convert(psz, buf))
3133                 return wxCONV_FAILED;
3134         }
3135
3136         return inbuf;
3137     }
3138
3139     virtual size_t GetMBNulLen() const
3140     {
3141         switch ( m_enc )
3142         {
3143             case wxFONTENCODING_UTF16BE:
3144             case wxFONTENCODING_UTF16LE:
3145                 return 2;
3146
3147             case wxFONTENCODING_UTF32BE:
3148             case wxFONTENCODING_UTF32LE:
3149                 return 4;
3150
3151             default:
3152                 return 1;
3153         }
3154     }
3155
3156     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
3157
3158     bool IsOk() const { return m_ok; }
3159
3160 public:
3161     wxFontEncoding m_enc;
3162     wxEncodingConverter m2w, w2m;
3163
3164 private:
3165     // were we initialized successfully?
3166     bool m_ok;
3167
3168     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
3169 };
3170
3171 // make the constructors available for unit testing
3172 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const wxChar* name )
3173 {
3174     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
3175     if ( !result->IsOk() )
3176     {
3177         delete result;
3178         return 0;
3179     }
3180
3181     return result;
3182 }
3183
3184 #endif // wxUSE_FONTMAP
3185
3186 // ============================================================================
3187 // wxCSConv implementation
3188 // ============================================================================
3189
3190 void wxCSConv::Init()
3191 {
3192     m_name = NULL;
3193     m_convReal =  NULL;
3194     m_deferred = true;
3195 }
3196
3197 wxCSConv::wxCSConv(const wxChar *charset)
3198 {
3199     Init();
3200
3201     if ( charset )
3202     {
3203         SetName(charset);
3204     }
3205
3206 #if wxUSE_FONTMAP
3207     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
3208 #else
3209     m_encoding = wxFONTENCODING_SYSTEM;
3210 #endif
3211 }
3212
3213 wxCSConv::wxCSConv(wxFontEncoding encoding)
3214 {
3215     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
3216     {
3217         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
3218
3219         encoding = wxFONTENCODING_SYSTEM;
3220     }
3221
3222     Init();
3223
3224     m_encoding = encoding;
3225 }
3226
// Frees the charset name and destroys the real converter; both are done by
// Clear().
3227 wxCSConv::~wxCSConv()
3228 {
3229     Clear();
3230 }
3231
3232 wxCSConv::wxCSConv(const wxCSConv& conv)
3233         : wxMBConv()
3234 {
3235     Init();
3236
3237     SetName(conv.m_name);
3238     m_encoding = conv.m_encoding;
3239 }
3240
3241 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
3242 {
3243     Clear();
3244
3245     SetName(conv.m_name);
3246     m_encoding = conv.m_encoding;
3247
3248     return *this;
3249 }
3250
3251 void wxCSConv::Clear()
3252 {
3253     free(m_name);
3254     delete m_convReal;
3255
3256     m_name = NULL;
3257     m_convReal = NULL;
3258 }
3259
3260 void wxCSConv::SetName(const wxChar *charset)
3261 {
3262     if (charset)
3263     {
3264         m_name = wxStrdup(charset);
3265         m_deferred = true;
3266     }
3267 }
3268
3269 #if wxUSE_FONTMAP
3270
// Hash map type associating a charset name with a font encoding.
3271 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3272                      wxEncodingNameCache );
3273
// Filled in by wxCSConv::DoCreate(): for each encoding it has tried with
// iconv, either the charset name which worked or an empty string if none of
// the names did.
3274 static wxEncodingNameCache gs_nameCache;
3275 #endif
3276
// Creates the converter object doing the real work for this charset.
//
// Returns NULL if no converter object is needed (ISO8859-1 and
// wxFONTENCODING_DEFAULT are converted by wxCSConv itself) or if no
// conversion method could handle the encoding; an error is logged when all
// the steps below were tried and failed. CreateConvIfNeeded() stores the
// result in m_convReal which is deleted in Clear().
3277 wxMBConv *wxCSConv::DoCreate() const
3278 {
3279 #if wxUSE_FONTMAP
3280     wxLogTrace(TRACE_STRCONV,
3281                wxT("creating conversion for %s"),
3282                (m_name ? m_name
3283                        : wxFontMapperBase::GetEncodingName(m_encoding).c_str()));
3284 #endif // wxUSE_FONTMAP
3285
3286     // check for the special case of ASCII or ISO8859-1 charset: as we have
3287     // special knowledge of it anyhow, we don't need to create a special
3288     // conversion object
3289     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3290             m_encoding == wxFONTENCODING_DEFAULT )
3291     {
3292         // don't convert at all
3293         return NULL;
3294     }
3295
3296     // we trust OS to do conversion better than we can so try external
3297     // conversion methods first
3298     //
3299     // the full order is:
3300     //      1. OS conversion (iconv() under Unix or Win32 API)
3301     //      2. hard coded conversions for UTF
3302     //      3. wxEncodingConverter as fall back
3303
3304     // step (1)
3305 #ifdef HAVE_ICONV
3306 #if !wxUSE_FONTMAP
3307     if ( m_name )
3308 #endif // !wxUSE_FONTMAP
3309     {
3310         wxString name(m_name);
3311 #if wxUSE_FONTMAP
3312         wxFontEncoding encoding(m_encoding);
3313 #endif
3314
             // first try the charset name exactly as it was given to us
3315         if ( !name.empty() )
3316         {
3317             wxMBConv_iconv *conv = new wxMBConv_iconv(name);
3318             if ( conv->IsOk() )
3319                 return conv;
3320
3321             delete conv;
3322
3323 #if wxUSE_FONTMAP
3324             encoding =
3325                 wxFontMapperBase::Get()->CharsetToEncoding(name, false);
3326 #endif // wxUSE_FONTMAP
3327         }
3328 #if wxUSE_FONTMAP
3329         {
                 // then check if we already know which name works with
                 // iconv for this encoding
3330             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3331             if ( it != gs_nameCache.end() )
3332             {
                     // an empty name is the cached failure stored below; note
                     // that none of the following steps is tried in this case
3333                 if ( it->second.empty() )
3334                     return NULL;
3335
3336                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second);
3337                 if ( conv->IsOk() )
3338                     return conv;
3339
3340                 delete conv;
3341             }
3342
                 // finally try all the names known for this encoding
3343             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3344             // CS : in case this does not return valid names (eg for MacRoman) encoding
3345             // got a 'failure' entry in the cache all the same, although it just has to
3346             // be created using a different method, so only store failed iconv creation
3347             // attempts (or perhaps we shouldn't do this at all ?)
3348             if ( names[0] != NULL )
3349             {
3350                 for ( ; *names; ++names )
3351                 {
3352                     wxMBConv_iconv *conv = new wxMBConv_iconv(*names);
3353                     if ( conv->IsOk() )
3354                     {
3355                         gs_nameCache[encoding] = *names;
3356                         return conv;
3357                     }
3358
3359                     delete conv;
3360                 }
3361
3362                 gs_nameCache[encoding] = _T(""); // cache the failure
3363             }
3364         }
3365 #endif // wxUSE_FONTMAP
3366     }
3367 #endif // HAVE_ICONV
3368
3369 #ifdef wxHAVE_WIN32_MB2WC
3370     {
3371 #if wxUSE_FONTMAP
3372         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3373                                       : new wxMBConv_win32(m_encoding);
3374         if ( conv->IsOk() )
3375             return conv;
3376
3377         delete conv;
3378 #else
             // without wxUSE_FONTMAP no Win32 converter is created and the
             // remaining steps are not tried neither
3379         return NULL;
3380 #endif
3381     }
3382 #endif // wxHAVE_WIN32_MB2WC
3383
3384 #if defined(__WXMAC__)
3385     {
3386         // leave UTF16 and UTF32 to the built-ins of wx
3387         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3388             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3389         {
3390 #if wxUSE_FONTMAP
3391             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
3392                                         : new wxMBConv_mac(m_encoding);
3393 #else
3394             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
3395 #endif
3396             if ( conv->IsOk() )
3397                  return conv;
3398
3399             delete conv;
3400         }
3401     }
3402 #endif
3403
3404 #if defined(__WXCOCOA__)
3405     {
3406         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
3407         {
3408 #if wxUSE_FONTMAP
3409             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
3410                                           : new wxMBConv_cocoa(m_encoding);
3411 #else
3412             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
3413 #endif
3414
3415             if ( conv->IsOk() )
3416                  return conv;
3417
3418             delete conv;
3419         }
3420     }
3421 #endif
3422     // step (2)
3423     wxFontEncoding enc = m_encoding;
3424 #if wxUSE_FONTMAP
3425     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3426     {
3427         // use "false" to suppress interactive dialogs -- we can be called from
3428         // anywhere and popping up a dialog from here is the last thing we want to
3429         // do
3430         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3431     }
3432 #endif // wxUSE_FONTMAP
3433
3434     switch ( enc )
3435     {
3436         case wxFONTENCODING_UTF7:
3437              return new wxMBConvUTF7;
3438
3439         case wxFONTENCODING_UTF8:
3440              return new wxMBConvUTF8;
3441
3442         case wxFONTENCODING_UTF16BE:
3443              return new wxMBConvUTF16BE;
3444
3445         case wxFONTENCODING_UTF16LE:
3446              return new wxMBConvUTF16LE;
3447
3448         case wxFONTENCODING_UTF32BE:
3449              return new wxMBConvUTF32BE;
3450
3451         case wxFONTENCODING_UTF32LE:
3452              return new wxMBConvUTF32LE;
3453
3454         default:
3455              // nothing to do but put here to suppress gcc warnings
3456              break;
3457     }
3458
3459     // step (3)
3460 #if wxUSE_FONTMAP
3461     {
3462         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3463                                       : new wxMBConv_wxwin(m_encoding);
3464         if ( conv->IsOk() )
3465             return conv;
3466
3467         delete conv;
3468     }
3469 #endif // wxUSE_FONTMAP
3470
3471     // NB: This is a hack to prevent deadlock. What could otherwise happen
3472     //     in Unicode build: wxConvLocal creation ends up being here
3473     //     because of some failure and logs the error. But wxLog will try to
3474     //     attach a timestamp, for which it will need wxConvLocal (to convert
3475     //     time to char* and then wchar_t*), but that fails, tries to log the
3476     //     error, but wxLog has an (already locked) critical section that
3477     //     guards the static buffer.
3478     static bool alreadyLoggingError = false;
3479     if (!alreadyLoggingError)
3480     {
3481         alreadyLoggingError = true;
3482         wxLogError(_("Cannot convert from the charset '%s'!"),
3483                    m_name ? m_name
3484                       :
3485 #if wxUSE_FONTMAP
3486                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
3487 #else // !wxUSE_FONTMAP
3488                          wxString::Format(_("encoding %i"), m_encoding).c_str()
3489 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3490               );
3491
3492         alreadyLoggingError = false;
3493     }
3494
3495     return NULL;
3496 }
3497
3498 void wxCSConv::CreateConvIfNeeded() const
3499 {
3500     if ( m_deferred )
3501     {
3502         wxCSConv *self = (wxCSConv *)this; // const_cast
3503
3504         // if we don't have neither the name nor the encoding, use the default
3505         // encoding for this system
3506         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3507         {
3508 #if wxUSE_INTL
3509             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
3510 #else
3511             // fallback to some reasonable default:
3512             self->m_encoding = wxFONTENCODING_ISO8859_1;
3513 #endif // wxUSE_INTL
3514         }
3515
3516         self->m_convReal = DoCreate();
3517         self->m_deferred = false;
3518     }
3519 }
3520
3521 bool wxCSConv::IsOk() const
3522 {
3523     CreateConvIfNeeded();
3524
3525     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3526     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3527         return true; // always ok as we do it ourselves
3528
3529     // m_convReal->IsOk() is called at its own creation, so we know it must
3530     // be ok if m_convReal is non-NULL
3531     return m_convReal != NULL;
3532 }
3533
3534 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3535                          const char *src, size_t srcLen) const
3536 {
3537     CreateConvIfNeeded();
3538
3539     if (m_convReal)
3540         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3541
3542     // latin-1 (direct)
3543     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3544 }
3545
3546 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3547                            const wchar_t *src, size_t srcLen) const
3548 {
3549     CreateConvIfNeeded();
3550
3551     if (m_convReal)
3552         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3553
3554     // latin-1 (direct)
3555     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3556 }
3557
3558 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3559 {
3560     CreateConvIfNeeded();
3561
3562     if (m_convReal)
3563         return m_convReal->MB2WC(buf, psz, n);
3564
3565     // latin-1 (direct)
3566     size_t len = strlen(psz);
3567
3568     if (buf)
3569     {
3570         for (size_t c = 0; c <= len; c++)
3571             buf[c] = (unsigned char)(psz[c]);
3572     }
3573
3574     return len;
3575 }
3576
3577 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3578 {
3579     CreateConvIfNeeded();
3580
3581     if (m_convReal)
3582         return m_convReal->WC2MB(buf, psz, n);
3583
3584     // latin-1 (direct)
3585     const size_t len = wxWcslen(psz);
3586     if (buf)
3587     {
3588         for (size_t c = 0; c <= len; c++)
3589         {
3590             if (psz[c] > 0xFF)
3591                 return wxCONV_FAILED;
3592
3593             buf[c] = (char)psz[c];
3594         }
3595     }
3596     else
3597     {
3598         for (size_t c = 0; c <= len; c++)
3599         {
3600             if (psz[c] > 0xFF)
3601                 return wxCONV_FAILED;
3602         }
3603     }
3604
3605     return len;
3606 }
3607
3608 size_t wxCSConv::GetMBNulLen() const
3609 {
3610     CreateConvIfNeeded();
3611
3612     if ( m_convReal )
3613     {
3614         return m_convReal->GetMBNulLen();
3615     }
3616
3617     return 1;
3618 }
3619
3620 // ----------------------------------------------------------------------------
3621 // globals
3622 // ----------------------------------------------------------------------------
3623
// The object behind wxConvLibc: its type depends on the platform.
3624 #ifdef __WINDOWS__
3625     static wxMBConv_win32 wxConvLibcObj;
3626 #elif defined(__WXMAC__) && !defined(__MACH__)
3627     static wxMBConv_mac wxConvLibcObj ;
3628 #else
3629     static wxMBConvLibc wxConvLibcObj;
3630 #endif
3631
// The other converter objects; all of them are local to this file and are
// only made available through the references and pointers defined below.
3632 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
3633 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
3634 static wxMBConvUTF7 wxConvUTF7Obj;
3635 static wxMBConvUTF8 wxConvUTF8Obj;
3636 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3637 static wxMBConv_macUTF8D wxConvMacUTF8DObj;
3638 #endif
// Exported references to the objects above.
3639 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
3640 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
3641 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
3642 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
3643 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// Exported pointers: initially wxConvCurrent points to the libc converter
// and wxConvUI to wxConvLocal.
3644 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
3645 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = &wxConvLocal;
// The converter for file names is decomposed UTF-8 in Carbon builds, plain
// UTF-8 in the other __WXOSX__ builds and the libc converter elsewhere.
3646 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
3647 #ifdef __WXOSX__
3648 #if defined(__WXMAC__) && defined(TARGET_CARBON)
3649                                     wxConvMacUTF8DObj;
3650 #else
3651                                     wxConvUTF8Obj;
3652 #endif
3653 #else // !__WXOSX__
3654                                     wxConvLibcObj;
3655 #endif // __WXOSX__/!__WXOSX__
3656
3657 #if wxUSE_UNICODE
3658
3659 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3660 {
3661     if ( !s )
3662         return wxWCharBuffer();
3663
3664     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3665     if ( !wbuf )
3666         wbuf = wxConvUTF8.cMB2WX(s);
3667     if ( !wbuf )
3668         wbuf = wxConvISO8859_1.cMB2WX(s);
3669
3670     return wbuf;
3671 }
3672
3673 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3674 {
3675     if ( !ws )
3676         return wxCharBuffer();
3677
3678     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3679     if ( !buf )
3680         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3681
3682     return buf;
3683 }
3684
3685 #endif // wxUSE_UNICODE
3686
3687 #else // !wxUSE_WCHAR_T
3688
3689 // Stand-ins used when wxUSE_WCHAR_T is 0: the same global names are still
// defined, but as plain wxMBConv objects.
3690 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3691                                 wxConvISO8859_1,
3692                                 wxConvLocal,
3693                                 wxConvUTF8;
3694
3695 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T