src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/mac/corefoundation/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// wxDecodeSurrogate_t is the element type of the buffers passed to
// wxDecodeSurrogate(): it is wchar_t itself when WC_UTF16 is defined and
// wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of ToWChar() in terms of the older MB2WC().
//
// Parameters:
//  dst     the output buffer or NULL if only the length is to be computed
//  dstLen  the size of dst in wide characters, only checked if dst != NULL
//  src     the multibyte input
//  srcLen  the length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters written (or which would be written
// if dst were not NULL), counting one L'\0' per converted chunk, or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy (the first test also
        // ensures that we never look before the start of the input)
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // each iteration converts one NUL-terminated chunk of the input
    for ( ;; )
    {
        // try to convert the current chunk: calling MB2WC() with NULL output
        // buffer only computes the length
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NB: this (empty) chunk has been counted in dstWritten above but
            //     its L'\0' is not stored in dst nor checked against dstLen
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 264
 265 size_t
 266 wxMBConv::FromWChar(char *dst, size_t dstLen,
 267                     const wchar_t *src, size_t srcLen) const
 268 {
 269     // the number of chars [which would be] written to dst [if it were not NULL]
 270     size_t dstWritten = 0;
 271
 272     // make a copy of the input string unless it is already properly
 273     // NUL-terminated
 274     //
 275     // if we don't know its length we have no choice but to assume that it is,
 276     // indeed, properly terminated
 277     wxWCharBuffer bufTmp;
 278     if ( srcLen == wxNO_LEN )
 279     {
 280         srcLen = wxWcslen(src) + 1;
 281     }
 282     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 283     {
 284         // make a copy in order to properly NUL-terminate the string
 285         bufTmp = wxWCharBuffer(srcLen);
 286         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 287         src = bufTmp;
 288     }
 289
 290     const size_t lenNul = GetMBNulLen();
 291     for ( const wchar_t * const srcEnd = src + srcLen;
 292           src < srcEnd;
 293           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 294     {
 295         // try to convert the current chunk
 296         size_t lenChunk = WC2MB(NULL, src, 0);
 297
 298         if ( lenChunk == wxCONV_FAILED )
 299             return wxCONV_FAILED;
 300
 301         lenChunk += lenNul;
 302         dstWritten += lenChunk;
 303
 304         if ( dst )
 305         {
 306             if ( dstWritten > dstLen )
 307                 return wxCONV_FAILED;
 308
 309             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 310                 return wxCONV_FAILED;
 311
 312             dst += lenChunk;
 313         }
 314     }
 315
 316     return dstWritten;
 317 }
 318
 319 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 320 {
 321     size_t rc = ToWChar(outBuff, outLen, inBuff);
 322     if ( rc != wxCONV_FAILED )
 323     {
 324         // ToWChar() returns the buffer length, i.e. including the trailing
 325         // NUL, while this method doesn't take it into account
 326         rc--;
 327     }
 328
 329     return rc;
 330 }
 331
 332 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 333 {
 334     size_t rc = FromWChar(outBuff, outLen, inBuff);
 335     if ( rc != wxCONV_FAILED )
 336     {
 337         rc -= GetMBNulLen();
 338     }
 339
 340     return rc;
 341 }
 342
// the destructor is defined out of line here, its body is intentionally empty
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 347
 348 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 349 {
 350     if ( psz )
 351     {
 352         // calculate the length of the buffer needed first
 353         const size_t nLen = ToWChar(NULL, 0, psz);
 354         if ( nLen != wxCONV_FAILED )
 355         {
 356             // now do the actual conversion
 357             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 358
 359             // +1 for the trailing NULL
 360             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 361                 return buf;
 362         }
 363     }
 364
 365     return wxWCharBuffer();
 366 }
 367
 368 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 369 {
 370     if ( pwz )
 371     {
 372         const size_t nLen = FromWChar(NULL, 0, pwz);
 373         if ( nLen != wxCONV_FAILED )
 374         {
 375             wxCharBuffer buf(nLen - 1);
 376             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 377                 return buf;
 378         }
 379     }
 380
 381     return wxCharBuffer();
 382 }
 383
 384 const wxWCharBuffer
 385 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 386 {
 387     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 388     if ( dstLen != wxCONV_FAILED )
 389     {
 390         // notice that we allocate space for dstLen+1 wide characters here
 391         // because we want the buffer to always be NUL-terminated, even if the
 392         // input isn't (as otherwise the caller has no way to know its length)
 393         wxWCharBuffer wbuf(dstLen);
 394         wbuf.data()[dstLen - 1] = L'\0';
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         const size_t nulLen = GetMBNulLen();
 421
 422         // as above, ensure that the buffer is always NUL-terminated, even if
 423         // the input is not
 424         wxCharBuffer buf(dstLen + nulLen - 1);
 425         memset(buf.data() + dstLen, 0, nulLen);
 426         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 427         {
 428             if ( outLen )
 429             {
 430                 *outLen = dstLen;
 431
 432                 if ( dstLen >= nulLen &&
 433                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 434                 {
 435                     // in this case the output is NUL-terminated and we're not
 436                     // supposed to count NUL
 437                     *outLen -= nulLen;
 438                 }
 439             }
 440
 441             return buf;
 442         }
 443     }
 444
 445     if ( outLen )
 446         *outLen = 0;
 447
 448     return wxCharBuffer();
 449 }
 450
 451 // ----------------------------------------------------------------------------
 452 // wxMBConvLibc
 453 // ----------------------------------------------------------------------------
 454
// multibyte to wide char conversion: simply forwards all the arguments to
// wxMB2WC() and returns its result unchanged
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 459
// wide char to multibyte conversion: simply forwards all the arguments to
// wxWC2MB() and returns its result unchanged
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 464
 465 // ----------------------------------------------------------------------------
 466 // wxConvBrokenFileNames
 467 // ----------------------------------------------------------------------------
 468
 469 #ifdef __UNIX__
 470
 471 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 472 {
 473     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 474          wxStricmp(charset, _T("UTF8")) == 0  )
 475         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 476     else
 477         m_conv = new wxCSConv(charset);
 478 }
 479
 480 #endif // __UNIX__
 481
 482 // ----------------------------------------------------------------------------
 483 // UTF-7
 484 // ----------------------------------------------------------------------------
 485
 486 // Implementation (C) 2004 Fredrik Roubert
 487
 488 //
 489 // BASE64 decoding table
 490 //
// The table has 256 entries and is indexed by the value of the input byte:
// the entries for 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' contain the
// corresponding 6 bit value (0x00..0x3f), all the others contain 0xff which
// is used as "not a BASE64 character" marker by the decoder.
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 526
 527 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 528 {
 529     size_t len = 0;
 530
 531     while ( *psz && (!buf || (len < n)) )
 532     {
 533         unsigned char cc = *psz++;
 534         if (cc != '+')
 535         {
 536             // plain ASCII char
 537             if (buf)
 538                 *buf++ = cc;
 539             len++;
 540         }
 541         else if (*psz == '-')
 542         {
 543             // encoded plus sign
 544             if (buf)
 545                 *buf++ = cc;
 546             len++;
 547             psz++;
 548         }
 549         else // start of BASE64 encoded string
 550         {
 551             bool lsb, ok;
 552             unsigned int d, l;
 553             for ( ok = lsb = false, d = 0, l = 0;
 554                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 555                   psz++ )
 556             {
 557                 d <<= 6;
 558                 d += cc;
 559                 for (l += 6; l >= 8; lsb = !lsb)
 560                 {
 561                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 562                     if (lsb)
 563                     {
 564                         if (buf)
 565                             *buf++ |= c;
 566                         len ++;
 567                     }
 568                     else
 569                     {
 570                         if (buf)
 571                             *buf = (wchar_t)(c << 8);
 572                     }
 573
 574                     ok = true;
 575                 }
 576             }
 577
 578             if ( !ok )
 579             {
 580                 // in valid UTF7 we should have valid characters after '+'
 581                 return wxCONV_FAILED;
 582             }
 583
 584             if (*psz == '-')
 585                 psz++;
 586         }
 587     }
 588
 589     if ( buf && (len < n) )
 590         *buf = '\0';
 591
 592     return len;
 593 }
 594
 595 //
 596 // BASE64 encoding table
 597 //
// maps a 6 bit value (0..63), used as the index, to the character
// representing it
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 609
 610 //
 611 // UTF-7 encoding table
 612 //
 613 // 0 - Set D (directly encoded characters)
 614 // 1 - Set O (optional direct characters)
 615 // 2 - whitespace characters (optional)
 616 // 3 - special characters
 617 //
// the table is indexed by the character code (0..127), 16 entries per row;
// the encoder in this file only writes characters whose class is 0 directly
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 629
 630 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 631 {
 632     size_t len = 0;
 633
 634     while (*psz && ((!buf) || (len < n)))
 635     {
 636         wchar_t cc = *psz++;
 637         if (cc < 0x80 && utf7encode[cc] < 1)
 638         {
 639             // plain ASCII char
 640             if (buf)
 641                 *buf++ = (char)cc;
 642
 643             len++;
 644         }
 645 #ifndef WC_UTF16
 646         else if (((wxUint32)cc) > 0xffff)
 647         {
 648             // no surrogate pair generation (yet?)
 649             return wxCONV_FAILED;
 650         }
 651 #endif
 652         else
 653         {
 654             if (buf)
 655                 *buf++ = '+';
 656
 657             len++;
 658             if (cc != '+')
 659             {
 660                 // BASE64 encode string
 661                 unsigned int lsb, d, l;
 662                 for (d = 0, l = 0; /*nothing*/; psz++)
 663                 {
 664                     for (lsb = 0; lsb < 2; lsb ++)
 665                     {
 666                         d <<= 8;
 667                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 668
 669                         for (l += 8; l >= 6; )
 670                         {
 671                             l -= 6;
 672                             if (buf)
 673                                 *buf++ = utf7enb64[(d >> l) % 64];
 674                             len++;
 675                         }
 676                     }
 677
 678                     cc = *psz;
 679                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 680                         break;
 681                 }
 682
 683                 if (l != 0)
 684                 {
 685                     if (buf)
 686                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 687
 688                     len++;
 689                 }
 690             }
 691
 692             if (buf)
 693                 *buf++ = '-';
 694             len++;
 695         }
 696     }
 697
 698     if (buf && (len < n))
 699         *buf = 0;
 700
 701     return len;
 702 }
 703
 704 // ----------------------------------------------------------------------------
 705 // UTF-8
 706 // ----------------------------------------------------------------------------
 707
// table of increasing upper bounds (0x7f, 0x7ff, 0xffff, ...); it is not used
// by the strict UTF-8 conversion functions below -- presumably entry i is the
// largest value which can be encoded using i+1 bytes, to confirm against the
// code using it
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 710
// boundaries of the private use area range we use to (temporarily) remap the
// bytes which are invalid in a UTF-8 encoded string: the range starts at
// wxUnicodePUA and contains 256 values, wxUnicodePUAEnd being the first
// value after its end
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 715
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte and contains the total number
// of bytes in the sequence (1 to 4) or 0 if this byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 750
 751 size_t
 752 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 753                             const char *src, size_t srcLen) const
 754 {
 755     wchar_t *out = dstLen ? dst : NULL;
 756     size_t written = 0;
 757
 758     if ( srcLen == wxNO_LEN )
 759         srcLen = strlen(src) + 1;
 760
 761     for ( const char *p = src; ; p++ )
 762     {
 763         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 764         {
 765             // all done successfully, just add the trailing NULL if we are not
 766             // using explicit length
 767             if ( srcLen == wxNO_LEN )
 768             {
 769                 if ( out )
 770                 {
 771                     if ( !dstLen )
 772                         break;
 773
 774                     *out = L'\0';
 775                 }
 776
 777                 written++;
 778             }
 779
 780             return written;
 781         }
 782
 783         if ( out && !dstLen-- )
 784             break;
 785
 786         wxUint32 code;
 787         unsigned char c = *p;
 788
 789         if ( c < 0x80 )
 790         {
 791             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 792                 break;
 793
 794             if ( srcLen != wxNO_LEN )
 795                 srcLen--;
 796
 797             code = c;
 798         }
 799         else
 800         {
 801             unsigned len = tableUtf8Lengths[c];
 802             if ( !len )
 803                 break;
 804
 805             if ( srcLen < len ) // the test works for wxNO_LEN too
 806                 break;
 807
 808             if ( srcLen != wxNO_LEN )
 809                 srcLen -= len;
 810
 811             //   Char. number range   |        UTF-8 octet sequence
 812             //      (hexadecimal)     |              (binary)
 813             //  ----------------------+----------------------------------------
 814             //  0000 0000 - 0000 007F | 0xxxxxxx
 815             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 816             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 817             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 818             //
 819             //  Code point value is stored in bits marked with 'x',
 820             //  lowest-order bit of the value on the right side in the diagram
 821             //  above.                                         (from RFC 3629)
 822
 823             // mask to extract lead byte's value ('x' bits above), by sequence
 824             // length:
 825             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 826
 827             // mask and value of lead byte's most significant bits, by length:
 828             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 829             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 830
 831             len--; // it's more convenient to work with 0-based length here
 832
 833             // extract the lead byte's value bits:
 834             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 835                 break;
 836
 837             code = c & leadValueMask[len];
 838
 839             // all remaining bytes, if any, are handled in the same way
 840             // regardless of sequence's length:
 841             for ( ; len; --len )
 842             {
 843                 c = *++p;
 844                 if ( (c & 0xC0) != 0x80 )
 845                     return wxCONV_FAILED;
 846
 847                 code <<= 6;
 848                 code |= c & 0x3F;
 849             }
 850         }
 851
 852 #ifdef WC_UTF16
 853         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 854         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 855         {
 856             if ( out )
 857                 out++;
 858             written++;
 859         }
 860 #else // !WC_UTF16
 861         if ( out )
 862             *out = code;
 863 #endif // WC_UTF16/!WC_UTF16
 864
 865         if ( out )
 866             out++;
 867
 868         written++;
 869     }
 870
 871     return wxCONV_FAILED;
 872 }
 873
 874 size_t
 875 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 876                               const wchar_t *src, size_t srcLen) const
 877 {
 878     char *out = dstLen ? dst : NULL;
 879     size_t written = 0;
 880
 881     for ( const wchar_t *wp = src; ; wp++ )
 882     {
 883         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 884         {
 885             // all done successfully, just add the trailing NULL if we are not
 886             // using explicit length
 887             if ( srcLen == wxNO_LEN )
 888             {
 889                 if ( out )
 890                 {
 891                     if ( !dstLen )
 892                         break;
 893
 894                     *out = '\0';
 895                 }
 896
 897                 written++;
 898             }
 899
 900             return written;
 901         }
 902
 903
 904         wxUint32 code;
 905 #ifdef WC_UTF16
 906         // cast is ok for WC_UTF16
 907         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 908         {
 909             // skip the next char too as we decoded a surrogate
 910             wp++;
 911         }
 912 #else // wchar_t is UTF-32
 913         code = *wp & 0x7fffffff;
 914 #endif
 915
 916         unsigned len;
 917         if ( code <= 0x7F )
 918         {
 919             len = 1;
 920             if ( out )
 921             {
 922                 if ( dstLen < len )
 923                     break;
 924
 925                 out[0] = (char)code;
 926             }
 927         }
 928         else if ( code <= 0x07FF )
 929         {
 930             len = 2;
 931             if ( out )
 932             {
 933                 if ( dstLen < len )
 934                     break;
 935
 936                 // NB: this line takes 6 least significant bits, encodes them as
 937                 // 10xxxxxx and discards them so that the next byte can be encoded:
 938                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 939                 out[0] = 0xC0 | code;
 940             }
 941         }
 942         else if ( code < 0xFFFF )
 943         {
 944             len = 3;
 945             if ( out )
 946             {
 947                 if ( dstLen < len )
 948                     break;
 949
 950                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 951                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 952                 out[0] = 0xE0 | code;
 953             }
 954         }
 955         else if ( code <= 0x10FFFF )
 956         {
 957             len = 4;
 958             if ( out )
 959             {
 960                 if ( dstLen < len )
 961                     break;
 962
 963                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 965                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 966                 out[0] = 0xF0 | code;
 967             }
 968         }
 969         else
 970         {
 971             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 972             break;
 973         }
 974
 975         if ( out )
 976         {
 977             out += len;
 978             dstLen -= len;
 979         }
 980
 981         written += len;
 982     }
 983
 984     // we only get here if an error occurs during decoding
 985     return wxCONV_FAILED;
 986 }
 987
 988 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
 989                              const char *psz, size_t srcLen) const
 990 {
 991     if ( m_options == MAP_INVALID_UTF8_NOT )
 992         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
 993
 994     size_t len = 0;
 995
 996     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
 997     {
 998         const char *opsz = psz;
 999         bool invalid = false;
1000         unsigned char cc = *psz++, fc = cc;
1001         unsigned cnt;
1002         for (cnt = 0; fc & 0x80; cnt++)
1003             fc <<= 1;
1004
1005         if (!cnt)
1006         {
1007             // plain ASCII char
1008             if (buf)
1009                 *buf++ = cc;
1010             len++;
1011
1012             // escape the escape character for octal escapes
1013             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1014                     && cc == '\\' && (!buf || len < n))
1015             {
1016                 if (buf)
1017                     *buf++ = cc;
1018                 len++;
1019             }
1020         }
1021         else
1022         {
1023             cnt--;
1024             if (!cnt)
1025             {
1026                 // invalid UTF-8 sequence
1027                 invalid = true;
1028             }
1029             else
1030             {
1031                 unsigned ocnt = cnt - 1;
1032                 wxUint32 res = cc & (0x3f >> cnt);
1033                 while (cnt--)
1034                 {
1035                     cc = *psz;
1036                     if ((cc & 0xC0) != 0x80)
1037                     {
1038                         // invalid UTF-8 sequence
1039                         invalid = true;
1040                         break;
1041                     }
1042
1043                     psz++;
1044                     res = (res << 6) | (cc & 0x3f);
1045                 }
1046
1047                 if (invalid || res <= utf8_max[ocnt])
1048                 {
1049                     // illegal UTF-8 encoding
1050                     invalid = true;
1051                 }
1052                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1053                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1054                 {
1055                     // if one of our PUA characters turns up externally
1056                     // it must also be treated as an illegal sequence
1057                     // (a bit like you have to escape an escape character)
1058                     invalid = true;
1059                 }
1060                 else
1061                 {
1062 #ifdef WC_UTF16
1063                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1064                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1065                     if (pa == wxCONV_FAILED)
1066                     {
1067                         invalid = true;
1068                     }
1069                     else
1070                     {
1071                         if (buf)
1072                             buf += pa;
1073                         len += pa;
1074                     }
1075 #else // !WC_UTF16
1076                     if (buf)
1077                         *buf++ = (wchar_t)res;
1078                     len++;
1079 #endif // WC_UTF16/!WC_UTF16
1080                 }
1081             }
1082
1083             if (invalid)
1084             {
1085                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1086                 {
1087                     while (opsz < psz && (!buf || len < n))
1088                     {
1089 #ifdef WC_UTF16
1090                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1091                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1092                         wxASSERT(pa != wxCONV_FAILED);
1093                         if (buf)
1094                             buf += pa;
1095                         opsz++;
1096                         len += pa;
1097 #else
1098                         if (buf)
1099                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1100                         opsz++;
1101                         len++;
1102 #endif
1103                     }
1104                 }
1105                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1106                 {
1107                     while (opsz < psz && (!buf || len < n))
1108                     {
1109                         if ( buf && len + 3 < n )
1110                         {
1111                             unsigned char on = *opsz;
1112                             *buf++ = L'\\';
1113                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1114                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1115                             *buf++ = (wchar_t)( L'0' + on % 010 );
1116                         }
1117
1118                         opsz++;
1119                         len += 4;
1120                     }
1121                 }
1122                 else // MAP_INVALID_UTF8_NOT
1123                 {
1124                     return wxCONV_FAILED;
1125                 }
1126             }
1127         }
1128     }
1129
1130     if (srcLen == wxNO_LEN && buf && (len < n))
1131         *buf = 0;
1132
1133     return len + 1;
1134 }
1135
// check whether the given wide character is one of the octal digits '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    return !(wch < L'0' || wch > L'7');
}
1140
1141 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1142                                const wchar_t *psz, size_t srcLen) const
1143 {
1144     if ( m_options == MAP_INVALID_UTF8_NOT )
1145         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1146
1147     size_t len = 0;
1148
1149     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150     {
1151         wxUint32 cc;
1152
1153 #ifdef WC_UTF16
1154         // cast is ok for WC_UTF16
1155         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1156         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1157 #else
1158         cc = (*psz++) & 0x7fffffff;
1159 #endif
1160
1161         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1162                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1163         {
1164             if (buf)
1165                 *buf++ = (char)(cc - wxUnicodePUA);
1166             len++;
1167         }
1168         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1169                     && cc == L'\\' && psz[0] == L'\\' )
1170         {
1171             if (buf)
1172                 *buf++ = (char)cc;
1173             psz++;
1174             len++;
1175         }
1176         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1177                     cc == L'\\' &&
1178                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1179         {
1180             if (buf)
1181             {
1182                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1183                                  (psz[1] - L'0') * 010 +
1184                                  (psz[2] - L'0'));
1185             }
1186
1187             psz += 3;
1188             len++;
1189         }
1190         else
1191         {
1192             unsigned cnt;
1193             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1194             {
1195             }
1196
1197             if (!cnt)
1198             {
1199                 // plain ASCII char
1200                 if (buf)
1201                     *buf++ = (char) cc;
1202                 len++;
1203             }
1204             else
1205             {
1206                 len += cnt + 1;
1207                 if (buf)
1208                 {
1209                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1210                     while (cnt--)
1211                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1212                 }
1213             }
1214         }
1215     }
1216
1217     if (srcLen == wxNO_LEN && buf && (len < n))
1218         *buf = 0;
1219
1220     return len + 1;
1221 }
1222
1223 // ============================================================================
1224 // UTF-16
1225 // ============================================================================
1226
// wxMBConvUTF16straight is the class used for the conversions without
// endianness change and wxMBConvUTF16swap the one for the endian-reversing
// conversions: which of the BE and LE classes plays which role depends on
// WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif
1234
1235 /* static */
1236 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1237 {
1238     if ( srcLen == wxNO_LEN )
1239     {
1240         // count the number of bytes in input, including the trailing NULs
1241         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1242         for ( srcLen = 1; *inBuff++; srcLen++ )
1243             ;
1244
1245         srcLen *= BYTES_PER_CHAR;
1246     }
1247     else // we already have the length
1248     {
1249         // we can only convert an entire number of UTF-16 characters
1250         if ( srcLen % BYTES_PER_CHAR )
1251             return wxCONV_FAILED;
1252     }
1253
1254     return srcLen;
1255 }
1256
1257 // case when in-memory representation is UTF-16 too
1258 #ifdef WC_UTF16
1259
1260 // ----------------------------------------------------------------------------
1261 // conversions without endianness change
1262 // ----------------------------------------------------------------------------
1263
1264 size_t
1265 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1266                                const char *src, size_t srcLen) const
1267 {
1268     // set up the scene for using memcpy() (which is presumably more efficient
1269     // than copying the bytes one by one)
1270     srcLen = GetLength(src, srcLen);
1271     if ( srcLen == wxNO_LEN )
1272         return wxCONV_FAILED;
1273
1274     const size_t inLen = srcLen / BYTES_PER_CHAR;
1275     if ( dst )
1276     {
1277         if ( dstLen < inLen )
1278             return wxCONV_FAILED;
1279
1280         memcpy(dst, src, srcLen);
1281     }
1282
1283     return inLen;
1284 }
1285
1286 size_t
1287 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1288                                  const wchar_t *src, size_t srcLen) const
1289 {
1290     if ( srcLen == wxNO_LEN )
1291         srcLen = wxWcslen(src) + 1;
1292
1293     srcLen *= BYTES_PER_CHAR;
1294
1295     if ( dst )
1296     {
1297         if ( dstLen < srcLen )
1298             return wxCONV_FAILED;
1299
1300         memcpy(dst, src, srcLen);
1301     }
1302
1303     return srcLen;
1304 }
1305
1306 // ----------------------------------------------------------------------------
1307 // endian-reversing conversions
1308 // ----------------------------------------------------------------------------
1309
1310 size_t
1311 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1312                            const char *src, size_t srcLen) const
1313 {
1314     srcLen = GetLength(src, srcLen);
1315     if ( srcLen == wxNO_LEN )
1316         return wxCONV_FAILED;
1317
1318     srcLen /= BYTES_PER_CHAR;
1319
1320     if ( dst )
1321     {
1322         if ( dstLen < srcLen )
1323             return wxCONV_FAILED;
1324
1325         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1326         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1327         {
1328             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1329         }
1330     }
1331
1332     return srcLen;
1333 }
1334
1335 size_t
1336 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1337                              const wchar_t *src, size_t srcLen) const
1338 {
1339     if ( srcLen == wxNO_LEN )
1340         srcLen = wxWcslen(src) + 1;
1341
1342     srcLen *= BYTES_PER_CHAR;
1343
1344     if ( dst )
1345     {
1346         if ( dstLen < srcLen )
1347             return wxCONV_FAILED;
1348
1349         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1350         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1351         {
1352             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1353         }
1354     }
1355
1356     return srcLen;
1357 }
1358
1359 #else // !WC_UTF16: wchar_t is UTF-32
1360
1361 // ----------------------------------------------------------------------------
1362 // conversions without endianness change
1363 // ----------------------------------------------------------------------------
1364
1365 size_t
1366 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1367                                const char *src, size_t srcLen) const
1368 {
1369     srcLen = GetLength(src, srcLen);
1370     if ( srcLen == wxNO_LEN )
1371         return wxCONV_FAILED;
1372
1373     const size_t inLen = srcLen / BYTES_PER_CHAR;
1374     if ( !dst )
1375     {
1376         // optimization: return maximal space which could be needed for this
1377         // string even if the real size could be smaller if the buffer contains
1378         // any surrogates
1379         return inLen;
1380     }
1381
1382     size_t outLen = 0;
1383     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1384     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1385     {
1386         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1387         if ( !inBuff )
1388             return wxCONV_FAILED;
1389
1390         if ( ++outLen > dstLen )
1391             return wxCONV_FAILED;
1392
1393         *dst++ = ch;
1394     }
1395
1396
1397     return outLen;
1398 }
1399
1400 size_t
1401 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1402                                  const wchar_t *src, size_t srcLen) const
1403 {
1404     if ( srcLen == wxNO_LEN )
1405         srcLen = wxWcslen(src) + 1;
1406
1407     size_t outLen = 0;
1408     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1409     for ( size_t n = 0; n < srcLen; n++ )
1410     {
1411         wxUint16 cc[2];
1412         const size_t numChars = encode_utf16(*src++, cc);
1413         if ( numChars == wxCONV_FAILED )
1414             return wxCONV_FAILED;
1415
1416         outLen += numChars * BYTES_PER_CHAR;
1417         if ( outBuff )
1418         {
1419             if ( outLen > dstLen )
1420                 return wxCONV_FAILED;
1421
1422             *outBuff++ = cc[0];
1423             if ( numChars == 2 )
1424             {
1425                 // second character of a surrogate
1426                 *outBuff++ = cc[1];
1427             }
1428         }
1429     }
1430
1431     return outLen;
1432 }
1433
1434 // ----------------------------------------------------------------------------
1435 // endian-reversing conversions
1436 // ----------------------------------------------------------------------------
1437
1438 size_t
1439 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1440                            const char *src, size_t srcLen) const
1441 {
1442     srcLen = GetLength(src, srcLen);
1443     if ( srcLen == wxNO_LEN )
1444         return wxCONV_FAILED;
1445
1446     const size_t inLen = srcLen / BYTES_PER_CHAR;
1447     if ( !dst )
1448     {
1449         // optimization: return maximal space which could be needed for this
1450         // string even if the real size could be smaller if the buffer contains
1451         // any surrogates
1452         return inLen;
1453     }
1454
1455     size_t outLen = 0;
1456     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1457     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1458     {
1459         wxUint32 ch;
1460         wxUint16 tmp[2];
1461
1462         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1463         inBuff++;
1464         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1465
1466         const size_t numChars = decode_utf16(tmp, ch);
1467         if ( numChars == wxCONV_FAILED )
1468             return wxCONV_FAILED;
1469
1470         if ( numChars == 2 )
1471             inBuff++;
1472
1473         if ( ++outLen > dstLen )
1474             return wxCONV_FAILED;
1475
1476         *dst++ = ch;
1477     }
1478
1479
1480     return outLen;
1481 }
1482
1483 size_t
1484 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1485                              const wchar_t *src, size_t srcLen) const
1486 {
1487     if ( srcLen == wxNO_LEN )
1488         srcLen = wxWcslen(src) + 1;
1489
1490     size_t outLen = 0;
1491     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1492     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1493     {
1494         wxUint16 cc[2];
1495         const size_t numChars = encode_utf16(*src, cc);
1496         if ( numChars == wxCONV_FAILED )
1497             return wxCONV_FAILED;
1498
1499         outLen += numChars * BYTES_PER_CHAR;
1500         if ( outBuff )
1501         {
1502             if ( outLen > dstLen )
1503                 return wxCONV_FAILED;
1504
1505             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1506             if ( numChars == 2 )
1507             {
1508                 // second character of a surrogate
1509                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1510             }
1511         }
1512     }
1513
1514     return outLen;
1515 }
1516
1517 #endif // WC_UTF16/!WC_UTF16
1518
1519
1520 // ============================================================================
1521 // UTF-32
1522 // ============================================================================
1523
1524 #ifdef WORDS_BIGENDIAN
1525     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1526     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1527 #else
1528     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1529     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1530 #endif
1531
1532
1533 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1534 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1535
1536 /* static */
1537 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1538 {
1539     if ( srcLen == wxNO_LEN )
1540     {
1541         // count the number of bytes in input, including the trailing NULs
1542         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1543         for ( srcLen = 1; *inBuff++; srcLen++ )
1544             ;
1545
1546         srcLen *= BYTES_PER_CHAR;
1547     }
1548     else // we already have the length
1549     {
1550         // we can only convert an entire number of UTF-32 characters
1551         if ( srcLen % BYTES_PER_CHAR )
1552             return wxCONV_FAILED;
1553     }
1554
1555     return srcLen;
1556 }
1557
1558 // case when in-memory representation is UTF-16
1559 #ifdef WC_UTF16
1560
1561 // ----------------------------------------------------------------------------
1562 // conversions without endianness change
1563 // ----------------------------------------------------------------------------
1564
1565 size_t
1566 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1567                                const char *src, size_t srcLen) const
1568 {
1569     srcLen = GetLength(src, srcLen);
1570     if ( srcLen == wxNO_LEN )
1571         return wxCONV_FAILED;
1572
1573     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1574     const size_t inLen = srcLen / BYTES_PER_CHAR;
1575     size_t outLen = 0;
1576     for ( size_t n = 0; n < inLen; n++ )
1577     {
1578         wxUint16 cc[2];
1579         const size_t numChars = encode_utf16(*inBuff++, cc);
1580         if ( numChars == wxCONV_FAILED )
1581             return wxCONV_FAILED;
1582
1583         outLen += numChars;
1584         if ( dst )
1585         {
1586             if ( outLen > dstLen )
1587                 return wxCONV_FAILED;
1588
1589             *dst++ = cc[0];
1590             if ( numChars == 2 )
1591             {
1592                 // second character of a surrogate
1593                 *dst++ = cc[1];
1594             }
1595         }
1596     }
1597
1598     return outLen;
1599 }
1600
1601 size_t
1602 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1603                                  const wchar_t *src, size_t srcLen) const
1604 {
1605     if ( srcLen == wxNO_LEN )
1606         srcLen = wxWcslen(src) + 1;
1607
1608     if ( !dst )
1609     {
1610         // optimization: return maximal space which could be needed for this
1611         // string instead of the exact amount which could be less if there are
1612         // any surrogates in the input
1613         //
1614         // we consider that surrogates are rare enough to make it worthwhile to
1615         // avoid running the loop below at the cost of slightly extra memory
1616         // consumption
1617         return srcLen * BYTES_PER_CHAR;
1618     }
1619
1620     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1621     size_t outLen = 0;
1622     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1623     {
1624         const wxUint32 ch = wxDecodeSurrogate(&src);
1625         if ( !src )
1626             return wxCONV_FAILED;
1627
1628         outLen += BYTES_PER_CHAR;
1629
1630         if ( outLen > dstLen )
1631             return wxCONV_FAILED;
1632
1633         *outBuff++ = ch;
1634     }
1635
1636     return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1645                            const char *src, size_t srcLen) const
1646 {
1647     srcLen = GetLength(src, srcLen);
1648     if ( srcLen == wxNO_LEN )
1649         return wxCONV_FAILED;
1650
1651     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1652     const size_t inLen = srcLen / BYTES_PER_CHAR;
1653     size_t outLen = 0;
1654     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1655     {
1656         wxUint16 cc[2];
1657         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1658         if ( numChars == wxCONV_FAILED )
1659             return wxCONV_FAILED;
1660
1661         outLen += numChars;
1662         if ( dst )
1663         {
1664             if ( outLen > dstLen )
1665                 return wxCONV_FAILED;
1666
1667             *dst++ = cc[0];
1668             if ( numChars == 2 )
1669             {
1670                 // second character of a surrogate
1671                 *dst++ = cc[1];
1672             }
1673         }
1674     }
1675
1676     return outLen;
1677 }
1678
1679 size_t
1680 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1681                              const wchar_t *src, size_t srcLen) const
1682 {
1683     if ( srcLen == wxNO_LEN )
1684         srcLen = wxWcslen(src) + 1;
1685
1686     if ( !dst )
1687     {
1688         // optimization: return maximal space which could be needed for this
1689         // string instead of the exact amount which could be less if there are
1690         // any surrogates in the input
1691         //
1692         // we consider that surrogates are rare enough to make it worthwhile to
1693         // avoid running the loop below at the cost of slightly extra memory
1694         // consumption
1695         return srcLen*BYTES_PER_CHAR;
1696     }
1697
1698     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1699     size_t outLen = 0;
1700     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1701     {
1702         const wxUint32 ch = wxDecodeSurrogate(&src);
1703         if ( !src )
1704             return wxCONV_FAILED;
1705
1706         outLen += BYTES_PER_CHAR;
1707
1708         if ( outLen > dstLen )
1709             return wxCONV_FAILED;
1710
1711         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1712     }
1713
1714     return outLen;
1715 }
1716
1717 #else // !WC_UTF16: wchar_t is UTF-32
1718
1719 // ----------------------------------------------------------------------------
1720 // conversions without endianness change
1721 // ----------------------------------------------------------------------------
1722
1723 size_t
1724 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1725                                const char *src, size_t srcLen) const
1726 {
1727     // use memcpy() as it should be much faster than hand-written loop
1728     srcLen = GetLength(src, srcLen);
1729     if ( srcLen == wxNO_LEN )
1730         return wxCONV_FAILED;
1731
1732     const size_t inLen = srcLen/BYTES_PER_CHAR;
1733     if ( dst )
1734     {
1735         if ( dstLen < inLen )
1736             return wxCONV_FAILED;
1737
1738         memcpy(dst, src, srcLen);
1739     }
1740
1741     return inLen;
1742 }
1743
1744 size_t
1745 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1746                                  const wchar_t *src, size_t srcLen) const
1747 {
1748     if ( srcLen == wxNO_LEN )
1749         srcLen = wxWcslen(src) + 1;
1750
1751     srcLen *= BYTES_PER_CHAR;
1752
1753     if ( dst )
1754     {
1755         if ( dstLen < srcLen )
1756             return wxCONV_FAILED;
1757
1758         memcpy(dst, src, srcLen);
1759     }
1760
1761     return srcLen;
1762 }
1763
1764 // ----------------------------------------------------------------------------
1765 // endian-reversing conversions
1766 // ----------------------------------------------------------------------------
1767
1768 size_t
1769 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1770                            const char *src, size_t srcLen) const
1771 {
1772     srcLen = GetLength(src, srcLen);
1773     if ( srcLen == wxNO_LEN )
1774         return wxCONV_FAILED;
1775
1776     srcLen /= BYTES_PER_CHAR;
1777
1778     if ( dst )
1779     {
1780         if ( dstLen < srcLen )
1781             return wxCONV_FAILED;
1782
1783         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1784         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1785         {
1786             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1787         }
1788     }
1789
1790     return srcLen;
1791 }
1792
1793 size_t
1794 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1795                              const wchar_t *src, size_t srcLen) const
1796 {
1797     if ( srcLen == wxNO_LEN )
1798         srcLen = wxWcslen(src) + 1;
1799
1800     srcLen *= BYTES_PER_CHAR;
1801
1802     if ( dst )
1803     {
1804         if ( dstLen < srcLen )
1805             return wxCONV_FAILED;
1806
1807         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1808         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1809         {
1810             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1811         }
1812     }
1813
1814     return srcLen;
1815 }
1816
1817 #endif // WC_UTF16/!WC_UTF16
1818
1819
1820 // ============================================================================
1821 // The classes doing conversion using the iconv_xxx() functions
1822 // ============================================================================
1823
1824 #ifdef HAVE_ICONV
1825
1826 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1827 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1828 //     (unless there's yet another bug in glibc) the only case when iconv()
1829 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1830 //     left in the input buffer -- when _real_ error occurs,
1831 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1832 //     iconv() failure.
1833 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) checks whether an iconv() call which returned
// cres and left bufLeft bytes in its input buffer has failed.
//
// The macro arguments are parenthesized in the expansion so that passing
// expressions (and not only simple variables) to it works as expected.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif
1840
1841 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1842
1843 #define ICONV_T_INVALID ((iconv_t)-1)
1844
1845 #if SIZEOF_WCHAR_T == 4
1846     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
1847     #define WC_ENC      wxFONTENCODING_UTF32
1848 #elif SIZEOF_WCHAR_T == 2
1849     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
1850     #define WC_ENC      wxFONTENCODING_UTF16
1851 #else // sizeof(wchar_t) != 2 nor 4
1852     // does this ever happen?
1853     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
1854 #endif
1855
1856 // ----------------------------------------------------------------------------
1857 // wxMBConv_iconv: encapsulates an iconv character set
1858 // ----------------------------------------------------------------------------
1859
1860 class wxMBConv_iconv : public wxMBConv
1861 {
1862 public:
1863     wxMBConv_iconv(const char *name);
1864     virtual ~wxMBConv_iconv();
1865
1866     // implement base class virtual methods
1867     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
1868                            const char *src, size_t srcLen = wxNO_LEN) const;
1869     virtual size_t FromWChar(char *dst, size_t dstLen,
1870                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
1871     virtual size_t GetMBNulLen() const;
1872
1873 #if wxUSE_UNICODE_UTF8
1874     virtual bool IsUTF8() const;
1875 #endif
1876
1877     virtual wxMBConv *Clone() const
1878     {
1879         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1880         p->m_minMBCharWidth = m_minMBCharWidth;
1881         return p;
1882     }
1883
1884     bool IsOk() const
1885         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1886
1887 protected:
1888     // the iconv handlers used to translate from multibyte
1889     // to wide char and in the other direction
1890     iconv_t m2w,
1891             w2m;
1892
1893 #if wxUSE_THREADS
1894     // guards access to m2w and w2m objects
1895     wxMutex m_iconvMutex;
1896 #endif
1897
1898 private:
1899     // the name (for iconv_open()) of a wide char charset -- if none is
1900     // available on this machine, it will remain NULL
1901     static wxString ms_wcCharsetName;
1902
1903     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1904     // different endian-ness than the native one
1905     static bool ms_wcNeedsSwap;
1906
1907
1908     // name of the encoding handled by this conversion
1909     wxString m_name;
1910
1911     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1912     // initially
1913     size_t m_minMBCharWidth;
1914 };
1915
1916 // make the constructor available for unit testing
1917 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1918 {
1919     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1920     if ( !result->IsOk() )
1921     {
1922         delete result;
1923         return 0;
1924     }
1925
1926     return result;
1927 }
1928
// name of the wchar_t charset passed to iconv_open(): it is empty until a
// wxMBConv_iconv constructor finds a charset which works
wxString wxMBConv_iconv::ms_wcCharsetName;
// set by the constructor if the charset above was found to produce wide
// characters which differ from the expected native value
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1931
1932 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1933               : m_name(name)
1934 {
1935     m_minMBCharWidth = 0;
1936
1937     // check for charset that represents wchar_t:
1938     if ( ms_wcCharsetName.empty() )
1939     {
1940         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1941
1942 #if wxUSE_FONTMAP
1943         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1944 #else // !wxUSE_FONTMAP
1945         static const wxChar *names_static[] =
1946         {
1947 #if SIZEOF_WCHAR_T == 4
1948             _T("UCS-4"),
1949 #elif SIZEOF_WCHAR_T = 2
1950             _T("UCS-2"),
1951 #endif
1952             NULL
1953         };
1954         const wxChar **names = names_static;
1955 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1956
1957         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1958         {
1959             const wxString nameCS(*names);
1960
1961             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1962             wxString nameXE(nameCS);
1963
1964 #ifdef WORDS_BIGENDIAN
1965                 nameXE += _T("BE");
1966 #else // little endian
1967                 nameXE += _T("LE");
1968 #endif
1969
1970             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1971                        nameXE.c_str());
1972
1973             m2w = iconv_open(nameXE.ToAscii(), name);
1974             if ( m2w == ICONV_T_INVALID )
1975             {
1976                 // try charset w/o bytesex info (e.g. "UCS4")
1977                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1978                            nameCS.c_str());
1979                 m2w = iconv_open(nameCS.ToAscii(), name);
1980
1981                 // and check for bytesex ourselves:
1982                 if ( m2w != ICONV_T_INVALID )
1983                 {
1984                     char    buf[2], *bufPtr;
1985                     wchar_t wbuf[2];
1986                     size_t  insz, outsz;
1987                     size_t  res;
1988
1989                     buf[0] = 'A';
1990                     buf[1] = 0;
1991                     wbuf[0] = 0;
1992                     insz = 2;
1993                     outsz = SIZEOF_WCHAR_T * 2;
1994                     char* wbufPtr = (char*)wbuf;
1995                     bufPtr = buf;
1996
1997                     res = iconv(
1998                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1999                         &wbufPtr, &outsz);
2000
2001                     if (ICONV_FAILED(res, insz))
2002                     {
2003                         wxLogLastError(wxT("iconv"));
2004                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2005                                    nameCS.c_str());
2006                     }
2007                     else // ok, can convert to this encoding, remember it
2008                     {
2009                         ms_wcCharsetName = nameCS;
2010                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2011                     }
2012                 }
2013             }
2014             else // use charset not requiring byte swapping
2015             {
2016                 ms_wcCharsetName = nameXE;
2017             }
2018         }
2019
2020         wxLogTrace(TRACE_STRCONV,
2021                    wxT("iconv wchar_t charset is \"%s\"%s"),
2022                    ms_wcCharsetName.empty() ? wxString("<none>")
2023                                             : ms_wcCharsetName,
2024                    ms_wcNeedsSwap ? _T(" (needs swap)")
2025                                   : _T(""));
2026     }
2027     else // we already have ms_wcCharsetName
2028     {
2029         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2030     }
2031
2032     if ( ms_wcCharsetName.empty() )
2033     {
2034         w2m = ICONV_T_INVALID;
2035     }
2036     else
2037     {
2038         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2039         if ( w2m == ICONV_T_INVALID )
2040         {
2041             wxLogTrace(TRACE_STRCONV,
2042                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2043                        ms_wcCharsetName.c_str(), name);
2044         }
2045     }
2046 }
2047
2048 wxMBConv_iconv::~wxMBConv_iconv()
2049 {
2050     if ( m2w != ICONV_T_INVALID )
2051         iconv_close(m2w);
2052     if ( w2m != ICONV_T_INVALID )
2053         iconv_close(w2m);
2054 }
2055
2056 size_t
2057 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2058                         const char *src, size_t srcLen) const
2059 {
2060     if ( srcLen == wxNO_LEN )
2061     {
2062         // find the string length: notice that must be done differently for
2063         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2064         // consecutive NULs
2065         const size_t nulLen = GetMBNulLen();
2066         switch ( nulLen )
2067         {
2068             default:
2069                 return wxCONV_FAILED;
2070
2071             case 1:
2072                 srcLen = strlen(src); // arguably more optimized than our version
2073                 break;
2074
2075             case 2:
2076             case 4:
2077                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2078                 // but they also have to start at character boundary and not
2079                 // span two adjacent characters
2080                 const char *p;
2081                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2082                     ;
2083                 srcLen = p - src;
2084                 break;
2085         }
2086     }
2087
2088     // we express length in the number of (wide) characters but iconv always
2089     // counts buffer sizes it in bytes
2090     dstLen *= SIZEOF_WCHAR_T;
2091
2092 #if wxUSE_THREADS
2093     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2094     //     Unfortunately there are a couple of global wxCSConv objects such as
2095     //     wxConvLocal that are used all over wx code, so we have to make sure
2096     //     the handle is used by at most one thread at the time. Otherwise
2097     //     only a few wx classes would be safe to use from non-main threads
2098     //     as MB<->WC conversion would fail "randomly".
2099     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2100 #endif // wxUSE_THREADS
2101
2102     size_t res, cres;
2103     const char *pszPtr = src;
2104
2105     if ( dst )
2106     {
2107         char* bufPtr = (char*)dst;
2108
2109         // have destination buffer, convert there
2110         cres = iconv(m2w,
2111                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2112                      &bufPtr, &dstLen);
2113         res = dstLen - (dstLen / SIZEOF_WCHAR_T);
2114
2115         if (ms_wcNeedsSwap)
2116         {
2117             // convert to native endianness
2118             for ( unsigned i = 0; i < res; i++ )
2119                 dst[dstLen] = WC_BSWAP(dst[i]);
2120         }
2121
2122         // NUL-terminate the string if there is any space left
2123         if (res < dstLen)
2124             dst[res] = 0;
2125     }
2126     else // no destination buffer
2127     {
2128         // convert using temp buffer to calculate the size of the buffer needed
2129         wchar_t tbuf[8];
2130         res = 0;
2131
2132         do
2133         {
2134             char* bufPtr = (char*)tbuf;
2135             dstLen = 8 * SIZEOF_WCHAR_T;
2136
2137             cres = iconv(m2w,
2138                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2139                          &bufPtr, &dstLen );
2140
2141             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2142         }
2143         while ((cres == (size_t)-1) && (errno == E2BIG));
2144     }
2145
2146     if (ICONV_FAILED(cres, srcLen))
2147     {
2148         //VS: it is ok if iconv fails, hence trace only
2149         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2150         return wxCONV_FAILED;
2151     }
2152
2153     return res;
2154 }
2155
2156 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2157                                  const wchar_t *src, size_t srcLen) const
2158 {
2159 #if wxUSE_THREADS
2160     // NB: explained in MB2WC
2161     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2162 #endif
2163
2164     if ( srcLen == wxNO_LEN )
2165         srcLen = wxWcslen(src);
2166
2167     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2168     size_t outbuflen = dstLen;
2169     size_t res, cres;
2170
2171     wchar_t *tmpbuf = 0;
2172
2173     if (ms_wcNeedsSwap)
2174     {
2175         // need to copy to temp buffer to switch endianness
2176         // (doing WC_BSWAP twice on the original buffer won't help, as it
2177         //  could be in read-only memory, or be accessed in some other thread)
2178         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2179         for ( size_t i = 0; i < srcLen; i++ )
2180             tmpbuf[i] = WC_BSWAP(src[i]);
2181
2182         tmpbuf[srcLen] = L'\0';
2183         src = tmpbuf;
2184     }
2185
2186     char* inbuf = (char*)src;
2187     if ( dst )
2188     {
2189         // have destination buffer, convert there
2190         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2191
2192         res = dstLen - outbuflen;
2193
2194         // NB: iconv was given only wcslen(src) characters on input, and so
2195         //     it couldn't convert the trailing zero. Let's do it ourselves
2196         //     if there's some room left for it in the output buffer.
2197         if (res < dstLen)
2198             dst[0] = 0;
2199     }
2200     else // no destination buffer
2201     {
2202         // convert using temp buffer to calculate the size of the buffer needed
2203         char tbuf[16];
2204         res = 0;
2205         do
2206         {
2207             dst = tbuf;
2208             outbuflen = 16;
2209
2210             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2211
2212             res += 16 - outbuflen;
2213         }
2214         while ((cres == (size_t)-1) && (errno == E2BIG));
2215     }
2216
2217     if (ms_wcNeedsSwap)
2218     {
2219         free(tmpbuf);
2220     }
2221
2222     if (ICONV_FAILED(cres, inbuflen))
2223     {
2224         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2225         return wxCONV_FAILED;
2226     }
2227
2228     return res;
2229 }
2230
2231 size_t wxMBConv_iconv::GetMBNulLen() const
2232 {
2233     if ( m_minMBCharWidth == 0 )
2234     {
2235         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2236
2237 #if wxUSE_THREADS
2238         // NB: explained in MB2WC
2239         wxMutexLocker lock(self->m_iconvMutex);
2240 #endif
2241
2242         const wchar_t *wnul = L"";
2243         char buf[8]; // should be enough for NUL in any encoding
2244         size_t inLen = sizeof(wchar_t),
2245                outLen = WXSIZEOF(buf);
2246         char *inBuff = (char *)wnul;
2247         char *outBuff = buf;
2248         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2249         {
2250             self->m_minMBCharWidth = (size_t)-1;
2251         }
2252         else // ok
2253         {
2254             self->m_minMBCharWidth = outBuff - buf;
2255         }
2256     }
2257
2258     return m_minMBCharWidth;
2259 }
2260
2261 #if wxUSE_UNICODE_UTF8
2262 bool wxMBConv_iconv::IsUTF8() const
2263 {
2264     return wxStricmp(m_name, "UTF-8") == 0 ||
2265            wxStricmp(m_name, "UTF8") == 0;
2266 }
2267 #endif
2268
2269 #endif // HAVE_ICONV
2270
2271
2272 // ============================================================================
2273 // Win32 conversion classes
2274 // ============================================================================
2275
2276 #ifdef wxHAVE_WIN32_MB2WC
2277
// from utils.cpp: these functions are used by wxMBConv_win32 constructors
// below to obtain the code page to use from the charset name or the font
// encoding
#if wxUSE_FONTMAP
extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
#endif
2283
// wxMBConv implementation using Win32 MultiByteToWideChar() and
// WideCharToMultiByte() functions for the code page selected when the object
// is created. Conversions which these functions would perform only
// approximately are detected, either using the appropriate flags or by
// converting the result back, and reported as failures.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses the current ANSI code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
        m_minMBCharWidth = 0;
    }

    // copy ctor: copies both the code page and the cached NUL length
    wxMBConv_win32(const wxMBConv_win32& conv)
        : wxMBConv()
    {
        m_CodePage = conv.m_CodePage;
        m_minMBCharWidth = conv.m_minMBCharWidth;
    }

#if wxUSE_FONTMAP
    // ctor from the charset name, use IsOk() to check if a code page
    // corresponding to it was found
    wxMBConv_win32(const char* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
        m_minMBCharWidth = 0;
    }

    // ctor from the font encoding, use IsOk() to check if a code page
    // corresponding to it was found
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
        m_minMBCharWidth = 0;
    }
#endif // wxUSE_FONTMAP

    // convert the NUL-terminated multibyte string psz to wide characters
    //
    // if buf is NULL, nothing is stored and only the length is computed,
    // otherwise n is passed to MultiByteToWideChar() as the size of buf
    //
    // returns the length of the result, not counting the terminating NUL,
    // or wxCONV_FAILED
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use MB_ERR_INVALID_CHARS flag as without it
        // the behaviour is not compatible with the Unix version (using iconv)
        // and break the library itself, e.g. wxTextInputStream::NextChar()
        // wouldn't work if reading an incomplete MB char didn't result in an
        // error
        //
        // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
        // Win XP or newer and it is not supported for UTF-[78] so we always
        // use our own conversions in this case. See
        //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
        //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
        if ( m_CodePage == CP_UTF8 )
        {
            return wxMBConvUTF8().MB2WC(buf, psz, n);
        }

        if ( m_CodePage == CP_UTF7 )
        {
            return wxMBConvUTF7().MB2WC(buf, psz, n);
        }

        // only ask for strict checking when the code page and the system
        // allow it, otherwise we check the result ourselves below
        int flags = 0;
        if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
                IsAtLeastWin2kSP4() )
        {
            flags = MB_ERR_INVALID_CHARS;
        }

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );
        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
        // check if we succeeded, by doing a double trip:
        if ( !flags && buf )
        {
            const size_t mbLen = strlen(psz);
            wxCharBuffer mbBuf(mbLen);
            if ( ::WideCharToMultiByte
                   (
                      m_CodePage,
                      0,
                      buf,
                      -1,
                      mbBuf.data(),
                      mbLen + 1,        // size in bytes, not length
                      NULL,
                      NULL
                   ) == 0 ||
                  strcmp(mbBuf, psz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len - 1;
    }

    // convert the NUL-terminated wide string pwz to the multibyte encoding
    //
    // if buf is NULL, nothing is stored and only the length is computed,
    // otherwise n is passed to WideCharToMultiByte() as the size of buf
    //
    // returns the length of the result, not counting the terminating NUL,
    // or wxCONV_FAILED, in particular if the conversion would be lossy
    virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return wxCONV_FAILED;
        }

        // we did something, check if we really succeeded
        if ( flags )
        {
            // check if the conversion failed, i.e. if any replacements
            // were done
            if ( usedDef )
                return wxCONV_FAILED;
        }
        else // we must resort to double tripping...
        {
            // first we need to ensure that we really have the MB data: this is
            // not the case if we're called with NULL buffer, in which case we
            // need to do the conversion yet again
            wxCharBuffer bufDef;
            if ( !buf )
            {
                bufDef = wxCharBuffer(len);
                buf = bufDef.data();
                if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
                                            buf, len, NULL, NULL) )
                    return wxCONV_FAILED;
            }

            // convert back to wide characters and compare with the input
            if ( !n )
                n = wcslen(pwz);
            wxWCharBuffer wcBuf(n);
            if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
                    wcscmp(wcBuf, pwz) != 0 )
            {
                // we didn't obtain the same thing we started from, hence
                // the conversion was lossy and we consider that it failed
                return wxCONV_FAILED;
            }
        }

        // see the comment above for the reason of "len - 1"
        return len - 1;
    }

    // return the number of bytes taken by NUL in this code page (1, 2 or 4)
    // or (size_t)-1 if it couldn't be determined; the value is computed only
    // once and cached in m_minMBCharWidth
    virtual size_t GetMBNulLen() const
    {
        if ( m_minMBCharWidth == 0 )
        {
            // passing NULL output buffer only returns the size needed
            int len = ::WideCharToMultiByte
                        (
                            m_CodePage,     // code page
                            0,              // no flags
                            L"",            // input string
                            1,              // translate just the NUL
                            NULL,           // output buffer
                            0,              // and its size
                            NULL,           // no replacement char
                            NULL            // [out] don't care if it was used
                        );

            // we need to modify the cached value from a const method
            wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
            switch ( len )
            {
                default:
                    wxLogDebug(_T("Unexpected NUL length %d"), len);
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 0:
                    // the function failed
                    self->m_minMBCharWidth = (size_t)-1;
                    break;

                case 1:
                case 2:
                case 4:
                    self->m_minMBCharWidth = len;
                    break;
            }
        }

        return m_minMBCharWidth;
    }

    virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }

    // -1 is the code page value indicating that none was found
    bool IsOk() const { return m_CodePage != -1; }

private:
    // return true if WC_NO_BEST_FIT_CHARS flag may be used on this system,
    // the OS version is checked only once and the result is remembered
    static bool CanUseNoBestFit()
    {
        // -1 means that the version wasn't checked yet
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxOS_WINDOWS_9X:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxOS_WINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown: be conservative by default
                    s_isWin98Or2k = 0;
                    break;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // return true if MB_ERR_INVALID_CHARS flag may be used on this system
    // (never the case under Windows CE), the OS version is checked only once
    // and the result is remembered
    static bool IsAtLeastWin2kSP4()
    {
#ifdef __WXWINCE__
        return false;
#else
        // -1 means that the version wasn't checked yet
        static int s_isAtLeastWin2kSP4 = -1;

        if ( s_isAtLeastWin2kSP4 == -1 )
        {
            OSVERSIONINFOEX ver;

            memset(&ver, 0, sizeof(ver));
            ver.dwOSVersionInfoSize = sizeof(ver);
            GetVersionEx((OSVERSIONINFO*)&ver);

            s_isAtLeastWin2kSP4 =
              ((ver.dwMajorVersion > 5) || // Vista+
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
               (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
               ver.wServicePackMajor >= 4)) // 2000 SP4+
              ? 1 : 0;
        }

        return s_isAtLeastWin2kSP4 == 1;
#endif
    }


    // the code page we're working with
    long m_CodePage;

    // cached result of GetMBNulLen(), set to 0 initially meaning
    // "unknown"
    size_t m_minMBCharWidth;
};
2591
2592 #endif // wxHAVE_WIN32_MB2WC
2593
2594
2595 // ============================================================================
2596 // wxEncodingConverter based conversion classes
2597 // ============================================================================
2598
2599 #if wxUSE_FONTMAP
2600
2601 class wxMBConv_wxwin : public wxMBConv
2602 {
2603 private:
2604     void Init()
2605     {
2606         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2607         // The wxMBConv_cf class does a better job.
2608         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2609                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2610                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2611     }
2612
2613 public:
2614     // temporarily just use wxEncodingConverter stuff,
2615     // so that it works while a better implementation is built
2616     wxMBConv_wxwin(const char* name)
2617     {
2618         if (name)
2619             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2620         else
2621             m_enc = wxFONTENCODING_SYSTEM;
2622
2623         Init();
2624     }
2625
2626     wxMBConv_wxwin(wxFontEncoding enc)
2627     {
2628         m_enc = enc;
2629
2630         Init();
2631     }
2632
2633     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2634     {
2635         size_t inbuf = strlen(psz);
2636         if (buf)
2637         {
2638             if (!m2w.Convert(psz, buf))
2639                 return wxCONV_FAILED;
2640         }
2641         return inbuf;
2642     }
2643
2644     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2645     {
2646         const size_t inbuf = wxWcslen(psz);
2647         if (buf)
2648         {
2649             if (!w2m.Convert(psz, buf))
2650                 return wxCONV_FAILED;
2651         }
2652
2653         return inbuf;
2654     }
2655
2656     virtual size_t GetMBNulLen() const
2657     {
2658         switch ( m_enc )
2659         {
2660             case wxFONTENCODING_UTF16BE:
2661             case wxFONTENCODING_UTF16LE:
2662                 return 2;
2663
2664             case wxFONTENCODING_UTF32BE:
2665             case wxFONTENCODING_UTF32LE:
2666                 return 4;
2667
2668             default:
2669                 return 1;
2670         }
2671     }
2672
2673     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2674
2675     bool IsOk() const { return m_ok; }
2676
2677 public:
2678     wxFontEncoding m_enc;
2679     wxEncodingConverter m2w, w2m;
2680
2681 private:
2682     // were we initialized successfully?
2683     bool m_ok;
2684
2685     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2686 };
2687
2688 // make the constructors available for unit testing
2689 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2690 {
2691     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2692     if ( !result->IsOk() )
2693     {
2694         delete result;
2695         return 0;
2696     }
2697
2698     return result;
2699 }
2700
2701 #endif // wxUSE_FONTMAP
2702
2703 // ============================================================================
2704 // wxCSConv implementation
2705 // ============================================================================
2706
2707 void wxCSConv::Init()
2708 {
2709     m_name = NULL;
2710     m_convReal =  NULL;
2711     m_deferred = true;
2712 }
2713
2714 wxCSConv::wxCSConv(const wxString& charset)
2715 {
2716     Init();
2717
2718     if ( !charset.empty() )
2719     {
2720         SetName(charset.ToAscii());
2721     }
2722
2723 #if wxUSE_FONTMAP
2724     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2725 #else
2726     m_encoding = wxFONTENCODING_SYSTEM;
2727 #endif
2728 }
2729
2730 wxCSConv::wxCSConv(wxFontEncoding encoding)
2731 {
2732     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2733     {
2734         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2735
2736         encoding = wxFONTENCODING_SYSTEM;
2737     }
2738
2739     Init();
2740
2741     m_encoding = encoding;
2742 }
2743
// Free the charset name and the real conversion object, if any.
wxCSConv::~wxCSConv()
{
    Clear();
}
2748
2749 wxCSConv::wxCSConv(const wxCSConv& conv)
2750         : wxMBConv()
2751 {
2752     Init();
2753
2754     SetName(conv.m_name);
2755     m_encoding = conv.m_encoding;
2756 }
2757
2758 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2759 {
2760     Clear();
2761
2762     SetName(conv.m_name);
2763     m_encoding = conv.m_encoding;
2764
2765     return *this;
2766 }
2767
2768 void wxCSConv::Clear()
2769 {
2770     free(m_name);
2771     delete m_convReal;
2772
2773     m_name = NULL;
2774     m_convReal = NULL;
2775 }
2776
2777 void wxCSConv::SetName(const char *charset)
2778 {
2779     if (charset)
2780     {
2781         m_name = wxStrdup(charset);
2782         m_deferred = true;
2783     }
2784 }
2785
#if wxUSE_FONTMAP

// hash map associating a charset name with a font encoding
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// cache used by wxCSConv::DoCreate(): for each encoding it stores the name
// for which an iconv-based converter could be created or an empty string if
// none of the names worked
static wxEncodingNameCache gs_nameCache;
#endif
2793
2794 wxMBConv *wxCSConv::DoCreate() const
2795 {
2796 #if wxUSE_FONTMAP
2797     wxLogTrace(TRACE_STRCONV,
2798                wxT("creating conversion for %s"),
2799                (m_name ? m_name
2800                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2801 #endif // wxUSE_FONTMAP
2802
2803     // check for the special case of ASCII or ISO8859-1 charset: as we have
2804     // special knowledge of it anyhow, we don't need to create a special
2805     // conversion object
2806     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2807             m_encoding == wxFONTENCODING_DEFAULT )
2808     {
2809         // don't convert at all
2810         return NULL;
2811     }
2812
2813     // we trust OS to do conversion better than we can so try external
2814     // conversion methods first
2815     //
2816     // the full order is:
2817     //      1. OS conversion (iconv() under Unix or Win32 API)
2818     //      2. hard coded conversions for UTF
2819     //      3. wxEncodingConverter as fall back
2820
2821     // step (1)
2822 #ifdef HAVE_ICONV
2823 #if !wxUSE_FONTMAP
2824     if ( m_name )
2825 #endif // !wxUSE_FONTMAP
2826     {
2827 #if wxUSE_FONTMAP
2828         wxFontEncoding encoding(m_encoding);
2829 #endif
2830
2831         if ( m_name )
2832         {
2833             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2834             if ( conv->IsOk() )
2835                 return conv;
2836
2837             delete conv;
2838
2839 #if wxUSE_FONTMAP
2840             encoding =
2841                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2842 #endif // wxUSE_FONTMAP
2843         }
2844 #if wxUSE_FONTMAP
2845         {
2846             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2847             if ( it != gs_nameCache.end() )
2848             {
2849                 if ( it->second.empty() )
2850                     return NULL;
2851
2852                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2853                 if ( conv->IsOk() )
2854                     return conv;
2855
2856                 delete conv;
2857             }
2858
2859             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2860             // CS: if this does not return valid names (e.g. for MacRoman),
2861             // the encoding got a 'failure' entry in the cache all the same,
2862             // although it just has to be created using a different method, so
2863             // only store failed iconv creation attempts (or perhaps we
2864             // shouldn't do this at all?)
2865             if ( names[0] != NULL )
2866             {
2867                 for ( ; *names; ++names )
2868                 {
2869                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2870                     //             will need changes that will obsolete this
2871                     wxString name(*names);
2872                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2873                     if ( conv->IsOk() )
2874                     {
2875                         gs_nameCache[encoding] = *names;
2876                         return conv;
2877                     }
2878
2879                     delete conv;
2880                 }
2881
2882                 gs_nameCache[encoding] = _T(""); // cache the failure
2883             }
2884         }
2885 #endif // wxUSE_FONTMAP
2886     }
2887 #endif // HAVE_ICONV
2888
2889 #ifdef wxHAVE_WIN32_MB2WC
2890     {
2891 #if wxUSE_FONTMAP
2892         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2893                                       : new wxMBConv_win32(m_encoding);
2894         if ( conv->IsOk() )
2895             return conv;
2896
2897         delete conv;
2898 #else
2899         return NULL;
2900 #endif
2901     }
2902 #endif // wxHAVE_WIN32_MB2WC
2903
2904 #ifdef __DARWIN__
2905     {
2906         // leave UTF16 and UTF32 to the built-ins of wx
2907         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2908             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2909         {
2910 #if wxUSE_FONTMAP
2911             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2912                                           : new wxMBConv_cf(m_encoding);
2913 #else
2914             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2915 #endif
2916
2917             if ( conv->IsOk() )
2918                  return conv;
2919
2920             delete conv;
2921         }
2922     }
2923 #endif // __DARWIN__
2924
2925     // step (2)
2926     wxFontEncoding enc = m_encoding;
2927 #if wxUSE_FONTMAP
2928     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2929     {
2930         // use "false" to suppress interactive dialogs -- we can be called from
2931         // anywhere and popping up a dialog from here is the last thing we want to
2932         // do
2933         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2934     }
2935 #endif // wxUSE_FONTMAP
2936
2937     switch ( enc )
2938     {
2939         case wxFONTENCODING_UTF7:
2940              return new wxMBConvUTF7;
2941
2942         case wxFONTENCODING_UTF8:
2943              return new wxMBConvUTF8;
2944
2945         case wxFONTENCODING_UTF16BE:
2946              return new wxMBConvUTF16BE;
2947
2948         case wxFONTENCODING_UTF16LE:
2949              return new wxMBConvUTF16LE;
2950
2951         case wxFONTENCODING_UTF32BE:
2952              return new wxMBConvUTF32BE;
2953
2954         case wxFONTENCODING_UTF32LE:
2955              return new wxMBConvUTF32LE;
2956
2957         default:
2958              // nothing to do but put here to suppress gcc warnings
2959              break;
2960     }
2961
2962     // step (3)
2963 #if wxUSE_FONTMAP
2964     {
2965         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2966                                       : new wxMBConv_wxwin(m_encoding);
2967         if ( conv->IsOk() )
2968             return conv;
2969
2970         delete conv;
2971     }
2972 #endif // wxUSE_FONTMAP
2973
2974     // NB: This is a hack to prevent deadlock. What could otherwise happen
2975     //     in Unicode build: wxConvLocal creation ends up being here
2976     //     because of some failure and logs the error. But wxLog will try to
2977     //     attach a timestamp, for which it will need wxConvLocal (to convert
2978     //     time to char* and then wchar_t*), but that fails, tries to log the
2979     //     error, but wxLog has an (already locked) critical section that
2980     //     guards the static buffer.
2981     static bool alreadyLoggingError = false;
2982     if (!alreadyLoggingError)
2983     {
2984         alreadyLoggingError = true;
2985         wxLogError(_("Cannot convert from the charset '%s'!"),
2986                    m_name ? m_name
2987                       :
2988 #if wxUSE_FONTMAP
2989                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2990 #else // !wxUSE_FONTMAP
2991                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2992 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2993               );
2994
2995         alreadyLoggingError = false;
2996     }
2997
2998     return NULL;
2999 }
3000
3001 void wxCSConv::CreateConvIfNeeded() const
3002 {
3003     if ( m_deferred )
3004     {
3005         wxCSConv *self = (wxCSConv *)this; // const_cast
3006
3007         // if we don't have neither the name nor the encoding, use the default
3008         // encoding for this system
3009         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3010         {
3011 #if wxUSE_INTL
3012             self->m_encoding = wxLocale::GetSystemEncoding();
3013 #else
3014             // fallback to some reasonable default:
3015             self->m_encoding = wxFONTENCODING_ISO8859_1;
3016 #endif // wxUSE_INTL
3017         }
3018
3019         self->m_convReal = DoCreate();
3020         self->m_deferred = false;
3021     }
3022 }
3023
3024 bool wxCSConv::IsOk() const
3025 {
3026     CreateConvIfNeeded();
3027
3028     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3029     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3030         return true; // always ok as we do it ourselves
3031
3032     // m_convReal->IsOk() is called at its own creation, so we know it must
3033     // be ok if m_convReal is non-NULL
3034     return m_convReal != NULL;
3035 }
3036
3037 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3038                          const char *src, size_t srcLen) const
3039 {
3040     CreateConvIfNeeded();
3041
3042     if (m_convReal)
3043         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3044
3045     // latin-1 (direct)
3046     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3047 }
3048
3049 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3050                            const wchar_t *src, size_t srcLen) const
3051 {
3052     CreateConvIfNeeded();
3053
3054     if (m_convReal)
3055         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3056
3057     // latin-1 (direct)
3058     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3059 }
3060
3061 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3062 {
3063     CreateConvIfNeeded();
3064
3065     if (m_convReal)
3066         return m_convReal->MB2WC(buf, psz, n);
3067
3068     // latin-1 (direct)
3069     size_t len = strlen(psz);
3070
3071     if (buf)
3072     {
3073         for (size_t c = 0; c <= len; c++)
3074             buf[c] = (unsigned char)(psz[c]);
3075     }
3076
3077     return len;
3078 }
3079
3080 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3081 {
3082     CreateConvIfNeeded();
3083
3084     if (m_convReal)
3085         return m_convReal->WC2MB(buf, psz, n);
3086
3087     // latin-1 (direct)
3088     const size_t len = wxWcslen(psz);
3089     if (buf)
3090     {
3091         for (size_t c = 0; c <= len; c++)
3092         {
3093             if (psz[c] > 0xFF)
3094                 return wxCONV_FAILED;
3095
3096             buf[c] = (char)psz[c];
3097         }
3098     }
3099     else
3100     {
3101         for (size_t c = 0; c <= len; c++)
3102         {
3103             if (psz[c] > 0xFF)
3104                 return wxCONV_FAILED;
3105         }
3106     }
3107
3108     return len;
3109 }
3110
3111 size_t wxCSConv::GetMBNulLen() const
3112 {
3113     CreateConvIfNeeded();
3114
3115     if ( m_convReal )
3116     {
3117         return m_convReal->GetMBNulLen();
3118     }
3119
3120     // otherwise, we are ISO-8859-1
3121     return 1;
3122 }
3123
#if wxUSE_UNICODE_UTF8
// Return the result of IsUTF8() of the real conversion object if there is
// one and false otherwise.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    // without a real conversion object we are ISO-8859-1, i.e. not UTF-8
    return m_convReal && m_convReal->IsUTF8();
}
#endif // wxUSE_UNICODE_UTF8
3138
3139
3140 #if wxUSE_UNICODE
3141
3142 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3143 {
3144     if ( !s )
3145         return wxWCharBuffer();
3146
3147     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3148     if ( !wbuf )
3149         wbuf = wxMBConvUTF8().cMB2WX(s);
3150     if ( !wbuf )
3151         wbuf = wxConvISO8859_1.cMB2WX(s);
3152
3153     return wbuf;
3154 }
3155
3156 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3157 {
3158     if ( !ws )
3159         return wxCharBuffer();
3160
3161     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3162     if ( !buf )
3163         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3164
3165     return buf;
3166 }
3167
3168 #endif // wxUSE_UNICODE
3169
3170 // ----------------------------------------------------------------------------
3171 // globals
3172 // ----------------------------------------------------------------------------
3173
3174 // NB: The reason why we create converter objects in this convoluted way,
3175 //     using a factory function instead of a global variable, is that they
3176 //     may be used at static initialization time (some of them are used by
3177 //     wxString ctors and there may be a global wxString object). In other
3178 //     words, possibly _before_ the converter global object would be
3179 //     initialized.
3180
3181 #undef wxConvLibc
3182 #undef wxConvUTF8
3183 #undef wxConvUTF7
3184 #undef wxConvLocal
3185 #undef wxConvISO8859_1
3186
// Define everything needed for the global converter called name:
//
//  - the exported name##Ptr pointer of type klass*, initialized to NULL
//  - the exported wxGet_##name##Ptr() function returning the address of a
//    function-local static object of type impl_klass declared with the
//    ctor_args initializer
//  - the file-local gs_##name##instance pointer initialized by calling this
//    function
//
// The expansion doesn't end with a semicolon, it must follow the macro use.
3187 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3188     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3189     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3190     {                                                                   \
3191         static impl_klass name##Obj ctor_args;                          \
3192         return &name##Obj;                                              \
3193     }                                                                   \
3194     /* this ensures that all global converter objects are created */    \
3195     /* by the time static initialization is done, i.e. before any */    \
3196     /* thread is launched: */                                           \
3197     static klass* gs_##name##instance = wxGet_##name##Ptr()
3198
// Shorter form of WX_DEFINE_GLOBAL_CONV2 for the case when the object itself
// is of the same class klass as the pointer to it.
3199 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3200     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3201
// wxConvLibc is a wxMBConv_win32 object under Windows and a wxMBConvLibc one
// elsewhere, in both cases it is accessed via a wxMBConv pointer
3202 #ifdef __WINDOWS__
3203     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3204 #else
3205     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3206 #endif
3207
3208 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3209 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3210 //     provokes an error message about "not enough macro parameters"; and we
3211 //     can't use "()" here as the name##Obj declaration would be parsed as a
3212 //     function declaration then, so use a semicolon and live with an extra
3213 //     empty statement (and hope that no compiler warns about this)
3214 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3215 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3216
// these two are wxCSConv objects created for the given font encoding
3217 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3218 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3219
// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
3220 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3221 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3222
3223 #ifdef __DARWIN__
3224 // The xnu kernel always communicates file paths in decomposed UTF-8.
3225 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3226 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3227 #endif
3228
// wxConvFileName initially points to the wxMBConv_cf UTF-8 object defined
// above under Darwin and to the wxConvLibc object on all the other platforms
3229 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3230 #ifdef __DARWIN__
3231                                     &wxConvMacUTF8DObj;
3232 #else // !__DARWIN__
3233                                     wxGet_wxConvLibcPtr();
3234 #endif // __DARWIN__/!__DARWIN__
3235
3236 #else // !wxUSE_WCHAR_T
3237
3238 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3239 // stand-ins in absence of wchar_t
// (in this case all these converters are defined as plain wxMBConv objects)
3240 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3241                                 wxConvISO8859_1,
3242                                 wxConvLocal,
3243                                 wxConvUTF8;
3244
3245 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T