src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef __SALFORDC__
  48     #include <clib.h>
  49 #endif
  50
  51 #ifdef HAVE_ICONV
  52     #include <iconv.h>
  53     #include "wx/thread.h"
  54 #endif
  55
  56 #include "wx/encconv.h"
  57 #include "wx/fontmap.h"
  58
  59 #ifdef __DARWIN__
  60 #include "wx/mac/corefoundation/private/strconv_cf.h"
  61 #endif //def __DARWIN__
  62
  63
  64 #define TRACE_STRCONV _T("strconv")
  65
  66 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  67 // be 4 bytes
  68 #if SIZEOF_WCHAR_T == 2
  69     #define WC_UTF16
  70 #endif
  71
  72
  73 // ============================================================================
  74 // implementation
  75 // ============================================================================
  76
  77 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  78 static bool NotAllNULs(const char *p, size_t n)
  79 {
  80     while ( n && *p++ == '\0' )
  81         n--;
  82
  83     return n != 0;
  84 }
  85
  86 // ----------------------------------------------------------------------------
  87 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  88 // ----------------------------------------------------------------------------
  89
  90 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  91 {
  92     if (input <= 0xffff)
  93     {
  94         if (output)
  95             *output = (wxUint16) input;
  96
  97         return 1;
  98     }
  99     else if (input >= 0x110000)
 100     {
 101         return wxCONV_FAILED;
 102     }
 103     else
 104     {
 105         if (output)
 106         {
 107             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 108             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 109         }
 110
 111         return 2;
 112     }
 113 }
 114
 115 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 116 {
 117     if ((*input < 0xd800) || (*input > 0xdfff))
 118     {
 119         output = *input;
 120         return 1;
 121     }
 122     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 123     {
 124         output = *input;
 125         return wxCONV_FAILED;
 126     }
 127     else
 128     {
 129         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 130         return 2;
 131     }
 132 }
 133
 134 #ifdef WC_UTF16
 135     typedef wchar_t wxDecodeSurrogate_t;
 136 #else // !WC_UTF16
 137     typedef wxUint16 wxDecodeSurrogate_t;
 138 #endif // WC_UTF16/!WC_UTF16
 139
 140 // returns the next UTF-32 character from the wchar_t buffer and advances the
 141 // pointer to the character after this one
 142 //
 143 // if an invalid character is found, *pSrc is set to NULL, the caller must
 144 // check for this
 145 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 146 {
 147     wxUint32 out;
 148     const size_t
 149         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 150     if ( n == wxCONV_FAILED )
 151         *pSrc = NULL;
 152     else
 153         *pSrc += n;
 154
 155     return out;
 156 }
 157
 158 // ----------------------------------------------------------------------------
 159 // wxMBConv
 160 // ----------------------------------------------------------------------------
 161
// Default implementation of the multibyte to wide char conversion in terms of
// the old MB2WC() function.
//
// dst is the output buffer of dstLen wide characters or NULL to just compute
// the size needed; src is the input of srcLen bytes or, if srcLen is
// wxNO_LEN, a string terminated by NUL(s).
//
// Returns the number of wide characters (which would be) written to dst, the
// L'\0' following each converted chunk included, or wxCONV_FAILED if
// GetMBNulLen() or MB2WC() fails or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);

            // append nulLen NUL bytes after the copied data
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the input may contain several NUL-separated chunks, convert them one by
    // one
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        lenChunk++; // for the L'\0' at the end of this chunk

        dstWritten += lenChunk;

        if ( lenChunk == 1 )
        {
            // nothing left in the input string, conversion succeeded
            //
            // NOTE(review): the L'\0' of this empty chunk is counted in
            // dstWritten but nothing is stored in dst for it as we leave the
            // loop before the code writing to dst below -- confirm that this
            // is intentional
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            if ( MB2WC(dst, src, lenChunk) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 268
 269 size_t
 270 wxMBConv::FromWChar(char *dst, size_t dstLen,
 271                     const wchar_t *src, size_t srcLen) const
 272 {
 273     // the number of chars [which would be] written to dst [if it were not NULL]
 274     size_t dstWritten = 0;
 275
 276     // make a copy of the input string unless it is already properly
 277     // NUL-terminated
 278     //
 279     // if we don't know its length we have no choice but to assume that it is,
 280     // indeed, properly terminated
 281     wxWCharBuffer bufTmp;
 282     if ( srcLen == wxNO_LEN )
 283     {
 284         srcLen = wxWcslen(src) + 1;
 285     }
 286     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 287     {
 288         // make a copy in order to properly NUL-terminate the string
 289         bufTmp = wxWCharBuffer(srcLen);
 290         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 291         src = bufTmp;
 292     }
 293
 294     const size_t lenNul = GetMBNulLen();
 295     for ( const wchar_t * const srcEnd = src + srcLen;
 296           src < srcEnd;
 297           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 298     {
 299         // try to convert the current chunk
 300         size_t lenChunk = WC2MB(NULL, src, 0);
 301
 302         if ( lenChunk == wxCONV_FAILED )
 303             return wxCONV_FAILED;
 304
 305         lenChunk += lenNul;
 306         dstWritten += lenChunk;
 307
 308         if ( dst )
 309         {
 310             if ( dstWritten > dstLen )
 311                 return wxCONV_FAILED;
 312
 313             if ( WC2MB(dst, src, lenChunk) == wxCONV_FAILED )
 314                 return wxCONV_FAILED;
 315
 316             dst += lenChunk;
 317         }
 318     }
 319
 320     return dstWritten;
 321 }
 322
 323 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 324 {
 325     size_t rc = ToWChar(outBuff, outLen, inBuff);
 326     if ( rc != wxCONV_FAILED )
 327     {
 328         // ToWChar() returns the buffer length, i.e. including the trailing
 329         // NUL, while this method doesn't take it into account
 330         rc--;
 331     }
 332
 333     return rc;
 334 }
 335
 336 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 337 {
 338     size_t rc = FromWChar(outBuff, outLen, inBuff);
 339     if ( rc != wxCONV_FAILED )
 340     {
 341         rc -= GetMBNulLen();
 342     }
 343
 344     return rc;
 345 }
 346
// Destructor with an empty body, defined out of line here.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 351
 352 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 353 {
 354     if ( psz )
 355     {
 356         // calculate the length of the buffer needed first
 357         const size_t nLen = ToWChar(NULL, 0, psz);
 358         if ( nLen != wxCONV_FAILED )
 359         {
 360             // now do the actual conversion
 361             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 362
 363             // +1 for the trailing NULL
 364             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 365                 return buf;
 366         }
 367     }
 368
 369     return wxWCharBuffer();
 370 }
 371
 372 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 373 {
 374     if ( pwz )
 375     {
 376         const size_t nLen = FromWChar(NULL, 0, pwz);
 377         if ( nLen != wxCONV_FAILED )
 378         {
 379             wxCharBuffer buf(nLen - 1);
 380             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 381                 return buf;
 382         }
 383     }
 384
 385     return wxCharBuffer();
 386 }
 387
 388 const wxWCharBuffer
 389 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 390 {
 391     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 392     if ( dstLen != wxCONV_FAILED )
 393     {
 394         wxWCharBuffer wbuf(dstLen - 1);
 395         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 396         {
 397             if ( outLen )
 398             {
 399                 *outLen = dstLen;
 400                 if ( wbuf[dstLen - 1] == L'\0' )
 401                     (*outLen)--;
 402             }
 403
 404             return wbuf;
 405         }
 406     }
 407
 408     if ( outLen )
 409         *outLen = 0;
 410
 411     return wxWCharBuffer();
 412 }
 413
 414 const wxCharBuffer
 415 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 416 {
 417     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 418     if ( dstLen != wxCONV_FAILED )
 419     {
 420         // special case of empty input: can't allocate 0 size buffer below as
 421         // wxCharBuffer insists on NUL-terminating it
 422         wxCharBuffer buf(dstLen ? dstLen - 1 : 1);
 423         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 424         {
 425             if ( outLen )
 426             {
 427                 *outLen = dstLen;
 428
 429                 const size_t nulLen = GetMBNulLen();
 430                 if ( dstLen >= nulLen &&
 431                         !NotAllNULs(buf.data() + dstLen - nulLen, nulLen) )
 432                 {
 433                     // in this case the output is NUL-terminated and we're not
 434                     // supposed to count NUL
 435                     *outLen -= nulLen;
 436                 }
 437             }
 438
 439             return buf;
 440         }
 441     }
 442
 443     if ( outLen )
 444         *outLen = 0;
 445
 446     return wxCharBuffer();
 447 }
 448
 449 // ----------------------------------------------------------------------------
 450 // wxMBConvLibc
 451 // ----------------------------------------------------------------------------
 452
// Converts the multibyte string psz to wide characters by forwarding the
// arguments unchanged to wxMB2WC() and returning its result as is.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 457
// Converts the wide character string psz to multibyte by forwarding the
// arguments unchanged to wxWC2MB() and returning its result as is.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 462
 463 // ----------------------------------------------------------------------------
 464 // wxConvBrokenFileNames
 465 // ----------------------------------------------------------------------------
 466
 467 #ifdef __UNIX__
 468
 469 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 470 {
 471     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 472          wxStricmp(charset, _T("UTF8")) == 0  )
 473         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 474     else
 475         m_conv = new wxCSConv(charset);
 476 }
 477
 478 #endif // __UNIX__
 479
 480 // ----------------------------------------------------------------------------
 481 // UTF-7
 482 // ----------------------------------------------------------------------------
 483
 484 // Implementation (C) 2004 Fredrik Roubert
 485
//
// BASE64 decoding table
//
// The table has 256 entries and is indexed by the value of the input byte:
// each entry is either the 6 bit value of the corresponding BASE64 digit or
// 0xff if the byte is not a BASE64 digit, in which case the decoder stops.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, // 'H'..'O'
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, // 'P'..'W'
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, // 'h'..'o'
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, // 'p'..'w'
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 'x'..'z'
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 524
 525 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 526 {
 527     size_t len = 0;
 528
 529     while ( *psz && (!buf || (len < n)) )
 530     {
 531         unsigned char cc = *psz++;
 532         if (cc != '+')
 533         {
 534             // plain ASCII char
 535             if (buf)
 536                 *buf++ = cc;
 537             len++;
 538         }
 539         else if (*psz == '-')
 540         {
 541             // encoded plus sign
 542             if (buf)
 543                 *buf++ = cc;
 544             len++;
 545             psz++;
 546         }
 547         else // start of BASE64 encoded string
 548         {
 549             bool lsb, ok;
 550             unsigned int d, l;
 551             for ( ok = lsb = false, d = 0, l = 0;
 552                   (cc = utf7unb64[(unsigned char)*psz]) != 0xff;
 553                   psz++ )
 554             {
 555                 d <<= 6;
 556                 d += cc;
 557                 for (l += 6; l >= 8; lsb = !lsb)
 558                 {
 559                     unsigned char c = (unsigned char)((d >> (l -= 8)) % 256);
 560                     if (lsb)
 561                     {
 562                         if (buf)
 563                             *buf++ |= c;
 564                         len ++;
 565                     }
 566                     else
 567                     {
 568                         if (buf)
 569                             *buf = (wchar_t)(c << 8);
 570                     }
 571
 572                     ok = true;
 573                 }
 574             }
 575
 576             if ( !ok )
 577             {
 578                 // in valid UTF7 we should have valid characters after '+'
 579                 return wxCONV_FAILED;
 580             }
 581
 582             if (*psz == '-')
 583                 psz++;
 584         }
 585     }
 586
 587     if ( buf && (len < n) )
 588         *buf = '\0';
 589
 590     return len;
 591 }
 592
//
// BASE64 encoding table
//
// Maps each 6 bit value (0..63) to the BASE64 digit representing it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 607
//
// UTF-7 encoding table
//
// Gives the class of each of the 128 ASCII characters, indexed by the
// character code:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// Notice that wxMBConvUTF7::WC2MB() only writes the characters of class 0
// directly, all the other ones are BASE64-encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 627
 628 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 629 {
 630     size_t len = 0;
 631
 632     while (*psz && ((!buf) || (len < n)))
 633     {
 634         wchar_t cc = *psz++;
 635         if (cc < 0x80 && utf7encode[cc] < 1)
 636         {
 637             // plain ASCII char
 638             if (buf)
 639                 *buf++ = (char)cc;
 640
 641             len++;
 642         }
 643 #ifndef WC_UTF16
 644         else if (((wxUint32)cc) > 0xffff)
 645         {
 646             // no surrogate pair generation (yet?)
 647             return wxCONV_FAILED;
 648         }
 649 #endif
 650         else
 651         {
 652             if (buf)
 653                 *buf++ = '+';
 654
 655             len++;
 656             if (cc != '+')
 657             {
 658                 // BASE64 encode string
 659                 unsigned int lsb, d, l;
 660                 for (d = 0, l = 0; /*nothing*/; psz++)
 661                 {
 662                     for (lsb = 0; lsb < 2; lsb ++)
 663                     {
 664                         d <<= 8;
 665                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 666
 667                         for (l += 8; l >= 6; )
 668                         {
 669                             l -= 6;
 670                             if (buf)
 671                                 *buf++ = utf7enb64[(d >> l) % 64];
 672                             len++;
 673                         }
 674                     }
 675
 676                     cc = *psz;
 677                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 678                         break;
 679                 }
 680
 681                 if (l != 0)
 682                 {
 683                     if (buf)
 684                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 685
 686                     len++;
 687                 }
 688             }
 689
 690             if (buf)
 691                 *buf++ = '-';
 692             len++;
 693         }
 694     }
 695
 696     if (buf && (len < n))
 697         *buf = 0;
 698
 699     return len;
 700 }
 701
 702 // ----------------------------------------------------------------------------
 703 // UTF-8
 704 // ----------------------------------------------------------------------------
 705
// NOTE(review): this table is not used in the part of the file reviewed
// here; its first 6 values are the largest numbers which fit in the payload
// bits of 1 to 6 byte long sequences (7, 11, 16, 21, 26 and 31 bits
// respectively) -- confirm against the code using it
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };

// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 713
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains
// the total number of bytes in it or 0 if the byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 748
 749 size_t
 750 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 751                             const char *src, size_t srcLen) const
 752 {
 753     wchar_t *out = dstLen ? dst : NULL;
 754     size_t written = 0;
 755
 756     if ( srcLen == wxNO_LEN )
 757         srcLen = strlen(src) + 1;
 758
 759     for ( const char *p = src; ; p++ )
 760     {
 761         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 762         {
 763             // all done successfully, just add the trailing NULL if we are not
 764             // using explicit length
 765             if ( srcLen == wxNO_LEN )
 766             {
 767                 if ( out )
 768                 {
 769                     if ( !dstLen )
 770                         break;
 771
 772                     *out = L'\0';
 773                 }
 774
 775                 written++;
 776             }
 777
 778             return written;
 779         }
 780
 781         if ( out && !dstLen-- )
 782             break;
 783
 784         wxUint32 code;
 785         unsigned char c = *p;
 786
 787         if ( c < 0x80 )
 788         {
 789             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 790                 break;
 791
 792             if ( srcLen != wxNO_LEN )
 793                 srcLen--;
 794
 795             code = c;
 796         }
 797         else
 798         {
 799             unsigned len = tableUtf8Lengths[c];
 800             if ( !len )
 801                 break;
 802
 803             if ( srcLen < len ) // the test works for wxNO_LEN too
 804                 break;
 805
 806             if ( srcLen != wxNO_LEN )
 807                 srcLen -= len;
 808
 809             //   Char. number range   |        UTF-8 octet sequence
 810             //      (hexadecimal)     |              (binary)
 811             //  ----------------------+----------------------------------------
 812             //  0000 0000 - 0000 007F | 0xxxxxxx
 813             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 814             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 815             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 816             //
 817             //  Code point value is stored in bits marked with 'x',
 818             //  lowest-order bit of the value on the right side in the diagram
 819             //  above.                                         (from RFC 3629)
 820
 821             // mask to extract lead byte's value ('x' bits above), by sequence
 822             // length:
 823             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 824
 825             // mask and value of lead byte's most significant bits, by length:
 826             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 827             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 828
 829             len--; // it's more convenient to work with 0-based length here
 830
 831             // extract the lead byte's value bits:
 832             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 833                 break;
 834
 835             code = c & leadValueMask[len];
 836
 837             // all remaining bytes, if any, are handled in the same way
 838             // regardless of sequence's length:
 839             for ( ; len; --len )
 840             {
 841                 c = *++p;
 842                 if ( (c & 0xC0) != 0x80 )
 843                     return wxCONV_FAILED;
 844
 845                 code <<= 6;
 846                 code |= c & 0x3F;
 847             }
 848         }
 849
 850 #ifdef WC_UTF16
 851         // cast is ok because wchar_t == wxUint16 if WC_UTF16
 852         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
 853         {
 854             if ( out )
 855                 out++;
 856             written++;
 857         }
 858 #else // !WC_UTF16
 859         if ( out )
 860             *out = code;
 861 #endif // WC_UTF16/!WC_UTF16
 862
 863         if ( out )
 864             out++;
 865
 866         written++;
 867     }
 868
 869     return wxCONV_FAILED;
 870 }
 871
 872 size_t
 873 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
 874                               const wchar_t *src, size_t srcLen) const
 875 {
 876     char *out = dstLen ? dst : NULL;
 877     size_t written = 0;
 878
 879     for ( const wchar_t *wp = src; ; wp++ )
 880     {
 881         if ( !(srcLen == wxNO_LEN ? *wp : srcLen--) )
 882         {
 883             // all done successfully, just add the trailing NULL if we are not
 884             // using explicit length
 885             if ( srcLen == wxNO_LEN )
 886             {
 887                 if ( out )
 888                 {
 889                     if ( !dstLen )
 890                         break;
 891
 892                     *out = '\0';
 893                 }
 894
 895                 written++;
 896             }
 897
 898             return written;
 899         }
 900
 901
 902         wxUint32 code;
 903 #ifdef WC_UTF16
 904         // cast is ok for WC_UTF16
 905         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
 906         {
 907             // skip the next char too as we decoded a surrogate
 908             wp++;
 909         }
 910 #else // wchar_t is UTF-32
 911         code = *wp & 0x7fffffff;
 912 #endif
 913
 914         unsigned len;
 915         if ( code <= 0x7F )
 916         {
 917             len = 1;
 918             if ( out )
 919             {
 920                 if ( dstLen < len )
 921                     break;
 922
 923                 out[0] = (char)code;
 924             }
 925         }
 926         else if ( code <= 0x07FF )
 927         {
 928             len = 2;
 929             if ( out )
 930             {
 931                 if ( dstLen < len )
 932                     break;
 933
 934                 // NB: this line takes 6 least significant bits, encodes them as
 935                 // 10xxxxxx and discards them so that the next byte can be encoded:
 936                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 937                 out[0] = 0xC0 | code;
 938             }
 939         }
 940         else if ( code < 0xFFFF )
 941         {
 942             len = 3;
 943             if ( out )
 944             {
 945                 if ( dstLen < len )
 946                     break;
 947
 948                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 949                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 950                 out[0] = 0xE0 | code;
 951             }
 952         }
 953         else if ( code <= 0x10FFFF )
 954         {
 955             len = 4;
 956             if ( out )
 957             {
 958                 if ( dstLen < len )
 959                     break;
 960
 961                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
 962                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
 963                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
 964                 out[0] = 0xF0 | code;
 965             }
 966         }
 967         else
 968         {
 969             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
 970             break;
 971         }
 972
 973         if ( out )
 974         {
 975             out += len;
 976             dstLen -= len;
 977         }
 978
 979         written += len;
 980     }
 981
 982     // we only get here if an error occurs during decoding
 983     return wxCONV_FAILED;
 984 }
 985
// Convert the NUL-terminated UTF-8 string psz to wide characters, mapping
// bytes which don't form valid UTF-8 according to m_options.
//
// If m_options is exactly MAP_INVALID_UTF8_NOT the work is delegated to
// wxMBConvStrictUTF8::MB2WC(). Otherwise each byte of an invalid sequence is
// either mapped to wxUnicodePUA + byte (MAP_INVALID_UTF8_TO_PUA) or written as
// a backslash followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL).
//
// buf may be NULL, in which case nothing is stored and only the length is
// computed; otherwise n is the size of buf in wide characters.
//
// Returns the number of wide characters counted in len (the trailing NUL is
// not counted and is only stored if len < n), or wxCONV_FAILED if an invalid
// sequence is met and neither mapping option is set.
size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::MB2WC(buf, psz, n);

    size_t len = 0;

    while (*psz && ((!buf) || (len < n)))
    {
        // remember the start of the sequence: if it turns out to be invalid,
        // all bytes in [opsz, psz) are escaped one by one below
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // from now on cnt is the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence: a 10xxxxxx byte can't start one
                invalid = true;
            }
            else
            {
                unsigned ocnt = cnt - 1;

                // payload bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                // presumably utf8_max[i] is the greatest value encodable
                // using i + 1 bytes, so this rejects overlong encodings --
                // the table is defined outside of this function
                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    //
                    // NOTE(review): only len < n is known to hold here, it
                    // looks like nothing checks that there is room for both
                    // halves of a surrogate pair -- confirm in encode_utf16()
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    // map each byte consumed so far to wxUnicodePUA + byte
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    // write each byte consumed so far as "\ooo"
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the 4 characters are only stored if they all fit
                        // but len is advanced by 4 in any case
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    // NUL-terminate the output only if there is still room for it
    if (buf && (len < n))
        *buf = 0;

    return len;
}
1132
// Check whether the given wide character is an octal digit, i.e. one of
// L'0' .. L'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1137
1138 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1139 {
1140     if ( m_options == MAP_INVALID_UTF8_NOT )
1141         return wxMBConvStrictUTF8::WC2MB(buf, psz, n);
1142
1143     size_t len = 0;
1144
1145     while (*psz && ((!buf) || (len < n)))
1146     {
1147         wxUint32 cc;
1148
1149 #ifdef WC_UTF16
1150         // cast is ok for WC_UTF16
1151         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1152         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1153 #else
1154         cc = (*psz++) & 0x7fffffff;
1155 #endif
1156
1157         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1158                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1159         {
1160             if (buf)
1161                 *buf++ = (char)(cc - wxUnicodePUA);
1162             len++;
1163         }
1164         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1165                     && cc == L'\\' && psz[0] == L'\\' )
1166         {
1167             if (buf)
1168                 *buf++ = (char)cc;
1169             psz++;
1170             len++;
1171         }
1172         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1173                     cc == L'\\' &&
1174                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1175         {
1176             if (buf)
1177             {
1178                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1179                                  (psz[1] - L'0') * 010 +
1180                                  (psz[2] - L'0'));
1181             }
1182
1183             psz += 3;
1184             len++;
1185         }
1186         else
1187         {
1188             unsigned cnt;
1189             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1190             {
1191             }
1192
1193             if (!cnt)
1194             {
1195                 // plain ASCII char
1196                 if (buf)
1197                     *buf++ = (char) cc;
1198                 len++;
1199             }
1200             else
1201             {
1202                 len += cnt + 1;
1203                 if (buf)
1204                 {
1205                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1206                     while (cnt--)
1207                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1208                 }
1209             }
1210         }
1211     }
1212
1213     if (buf && (len < n))
1214         *buf = 0;
1215
1216     return len;
1217 }
1218
1219 // ============================================================================
1220 // UTF-16
1221 // ============================================================================
1222
// The methods below are written in terms of two aliases: the "straight"
// conversion copies the 16 bit units unchanged while the "swap" one reverses
// the bytes of each unit. WORDS_BIGENDIAN selects which of wxMBConvUTF16BE and
// wxMBConvUTF16LE each alias stands for.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1230
1231 /* static */
1232 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1233 {
1234     if ( srcLen == wxNO_LEN )
1235     {
1236         // count the number of bytes in input, including the trailing NULs
1237         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1238         for ( srcLen = 1; *inBuff++; srcLen++ )
1239             ;
1240
1241         srcLen *= BYTES_PER_CHAR;
1242     }
1243     else // we already have the length
1244     {
1245         // we can only convert an entire number of UTF-16 characters
1246         if ( srcLen % BYTES_PER_CHAR )
1247             return wxCONV_FAILED;
1248     }
1249
1250     return srcLen;
1251 }
1252
1253 // case when in-memory representation is UTF-16 too
1254 #ifdef WC_UTF16
1255
1256 // ----------------------------------------------------------------------------
1257 // conversions without endianness change
1258 // ----------------------------------------------------------------------------
1259
1260 size_t
1261 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1262                                const char *src, size_t srcLen) const
1263 {
1264     // set up the scene for using memcpy() (which is presumably more efficient
1265     // than copying the bytes one by one)
1266     srcLen = GetLength(src, srcLen);
1267     if ( srcLen == wxNO_LEN )
1268         return wxCONV_FAILED;
1269
1270     const size_t inLen = srcLen / BYTES_PER_CHAR;
1271     if ( dst )
1272     {
1273         if ( dstLen < inLen )
1274             return wxCONV_FAILED;
1275
1276         memcpy(dst, src, srcLen);
1277     }
1278
1279     return inLen;
1280 }
1281
1282 size_t
1283 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1284                                  const wchar_t *src, size_t srcLen) const
1285 {
1286     if ( srcLen == wxNO_LEN )
1287         srcLen = wxWcslen(src) + 1;
1288
1289     srcLen *= BYTES_PER_CHAR;
1290
1291     if ( dst )
1292     {
1293         if ( dstLen < srcLen )
1294             return wxCONV_FAILED;
1295
1296         memcpy(dst, src, srcLen);
1297     }
1298
1299     return srcLen;
1300 }
1301
1302 // ----------------------------------------------------------------------------
1303 // endian-reversing conversions
1304 // ----------------------------------------------------------------------------
1305
1306 size_t
1307 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1308                            const char *src, size_t srcLen) const
1309 {
1310     srcLen = GetLength(src, srcLen);
1311     if ( srcLen == wxNO_LEN )
1312         return wxCONV_FAILED;
1313
1314     srcLen /= BYTES_PER_CHAR;
1315
1316     if ( dst )
1317     {
1318         if ( dstLen < srcLen )
1319             return wxCONV_FAILED;
1320
1321         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1322         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1323         {
1324             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1325         }
1326     }
1327
1328     return srcLen;
1329 }
1330
1331 size_t
1332 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1333                              const wchar_t *src, size_t srcLen) const
1334 {
1335     if ( srcLen == wxNO_LEN )
1336         srcLen = wxWcslen(src) + 1;
1337
1338     srcLen *= BYTES_PER_CHAR;
1339
1340     if ( dst )
1341     {
1342         if ( dstLen < srcLen )
1343             return wxCONV_FAILED;
1344
1345         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1346         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1347         {
1348             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1349         }
1350     }
1351
1352     return srcLen;
1353 }
1354
1355 #else // !WC_UTF16: wchar_t is UTF-32
1356
1357 // ----------------------------------------------------------------------------
1358 // conversions without endianness change
1359 // ----------------------------------------------------------------------------
1360
1361 size_t
1362 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1363                                const char *src, size_t srcLen) const
1364 {
1365     srcLen = GetLength(src, srcLen);
1366     if ( srcLen == wxNO_LEN )
1367         return wxCONV_FAILED;
1368
1369     const size_t inLen = srcLen / BYTES_PER_CHAR;
1370     if ( !dst )
1371     {
1372         // optimization: return maximal space which could be needed for this
1373         // string even if the real size could be smaller if the buffer contains
1374         // any surrogates
1375         return inLen;
1376     }
1377
1378     size_t outLen = 0;
1379     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1380     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1381     {
1382         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1383         if ( !inBuff )
1384             return wxCONV_FAILED;
1385
1386         if ( ++outLen > dstLen )
1387             return wxCONV_FAILED;
1388
1389         *dst++ = ch;
1390     }
1391
1392
1393     return outLen;
1394 }
1395
1396 size_t
1397 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1398                                  const wchar_t *src, size_t srcLen) const
1399 {
1400     if ( srcLen == wxNO_LEN )
1401         srcLen = wxWcslen(src) + 1;
1402
1403     size_t outLen = 0;
1404     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1405     for ( size_t n = 0; n < srcLen; n++ )
1406     {
1407         wxUint16 cc[2];
1408         const size_t numChars = encode_utf16(*src++, cc);
1409         if ( numChars == wxCONV_FAILED )
1410             return wxCONV_FAILED;
1411
1412         outLen += numChars * BYTES_PER_CHAR;
1413         if ( outBuff )
1414         {
1415             if ( outLen > dstLen )
1416                 return wxCONV_FAILED;
1417
1418             *outBuff++ = cc[0];
1419             if ( numChars == 2 )
1420             {
1421                 // second character of a surrogate
1422                 *outBuff++ = cc[1];
1423             }
1424         }
1425     }
1426
1427     return outLen;
1428 }
1429
1430 // ----------------------------------------------------------------------------
1431 // endian-reversing conversions
1432 // ----------------------------------------------------------------------------
1433
1434 size_t
1435 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1436                            const char *src, size_t srcLen) const
1437 {
1438     srcLen = GetLength(src, srcLen);
1439     if ( srcLen == wxNO_LEN )
1440         return wxCONV_FAILED;
1441
1442     const size_t inLen = srcLen / BYTES_PER_CHAR;
1443     if ( !dst )
1444     {
1445         // optimization: return maximal space which could be needed for this
1446         // string even if the real size could be smaller if the buffer contains
1447         // any surrogates
1448         return inLen;
1449     }
1450
1451     size_t outLen = 0;
1452     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1453     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1454     {
1455         wxUint32 ch;
1456         wxUint16 tmp[2];
1457
1458         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1459         inBuff++;
1460         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1461
1462         const size_t numChars = decode_utf16(tmp, ch);
1463         if ( numChars == wxCONV_FAILED )
1464             return wxCONV_FAILED;
1465
1466         if ( numChars == 2 )
1467             inBuff++;
1468
1469         if ( ++outLen > dstLen )
1470             return wxCONV_FAILED;
1471
1472         *dst++ = ch;
1473     }
1474
1475
1476     return outLen;
1477 }
1478
1479 size_t
1480 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1481                              const wchar_t *src, size_t srcLen) const
1482 {
1483     if ( srcLen == wxNO_LEN )
1484         srcLen = wxWcslen(src) + 1;
1485
1486     size_t outLen = 0;
1487     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1488     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1489     {
1490         wxUint16 cc[2];
1491         const size_t numChars = encode_utf16(*src, cc);
1492         if ( numChars == wxCONV_FAILED )
1493             return wxCONV_FAILED;
1494
1495         outLen += numChars * BYTES_PER_CHAR;
1496         if ( outBuff )
1497         {
1498             if ( outLen > dstLen )
1499                 return wxCONV_FAILED;
1500
1501             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1502             if ( numChars == 2 )
1503             {
1504                 // second character of a surrogate
1505                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1506             }
1507         }
1508     }
1509
1510     return outLen;
1511 }
1512
1513 #endif // WC_UTF16/!WC_UTF16
1514
1515
1516 // ============================================================================
1517 // UTF-32
1518 // ============================================================================
1519
// As for UTF-16, the methods below are written in terms of two aliases: the
// "straight" conversion keeps the 32 bit units unchanged while the "swap" one
// reverses the bytes of each unit. WORDS_BIGENDIAN selects which of
// wxMBConvUTF32BE and wxMBConvUTF32LE each alias stands for.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// definitions of the global UTF-32 converter objects
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1531
1532 /* static */
1533 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1534 {
1535     if ( srcLen == wxNO_LEN )
1536     {
1537         // count the number of bytes in input, including the trailing NULs
1538         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1539         for ( srcLen = 1; *inBuff++; srcLen++ )
1540             ;
1541
1542         srcLen *= BYTES_PER_CHAR;
1543     }
1544     else // we already have the length
1545     {
1546         // we can only convert an entire number of UTF-32 characters
1547         if ( srcLen % BYTES_PER_CHAR )
1548             return wxCONV_FAILED;
1549     }
1550
1551     return srcLen;
1552 }
1553
1554 // case when in-memory representation is UTF-16
1555 #ifdef WC_UTF16
1556
1557 // ----------------------------------------------------------------------------
1558 // conversions without endianness change
1559 // ----------------------------------------------------------------------------
1560
1561 size_t
1562 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1563                                const char *src, size_t srcLen) const
1564 {
1565     srcLen = GetLength(src, srcLen);
1566     if ( srcLen == wxNO_LEN )
1567         return wxCONV_FAILED;
1568
1569     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1570     const size_t inLen = srcLen / BYTES_PER_CHAR;
1571     size_t outLen = 0;
1572     for ( size_t n = 0; n < inLen; n++ )
1573     {
1574         wxUint16 cc[2];
1575         const size_t numChars = encode_utf16(*inBuff++, cc);
1576         if ( numChars == wxCONV_FAILED )
1577             return wxCONV_FAILED;
1578
1579         outLen += numChars;
1580         if ( dst )
1581         {
1582             if ( outLen > dstLen )
1583                 return wxCONV_FAILED;
1584
1585             *dst++ = cc[0];
1586             if ( numChars == 2 )
1587             {
1588                 // second character of a surrogate
1589                 *dst++ = cc[1];
1590             }
1591         }
1592     }
1593
1594     return outLen;
1595 }
1596
1597 size_t
1598 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1599                                  const wchar_t *src, size_t srcLen) const
1600 {
1601     if ( srcLen == wxNO_LEN )
1602         srcLen = wxWcslen(src) + 1;
1603
1604     if ( !dst )
1605     {
1606         // optimization: return maximal space which could be needed for this
1607         // string instead of the exact amount which could be less if there are
1608         // any surrogates in the input
1609         //
1610         // we consider that surrogates are rare enough to make it worthwhile to
1611         // avoid running the loop below at the cost of slightly extra memory
1612         // consumption
1613         return srcLen * BYTES_PER_CHAR;
1614     }
1615
1616     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1617     size_t outLen = 0;
1618     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1619     {
1620         const wxUint32 ch = wxDecodeSurrogate(&src);
1621         if ( !src )
1622             return wxCONV_FAILED;
1623
1624         outLen += BYTES_PER_CHAR;
1625
1626         if ( outLen > dstLen )
1627             return wxCONV_FAILED;
1628
1629         *outBuff++ = ch;
1630     }
1631
1632     return outLen;
1633 }
1634
1635 // ----------------------------------------------------------------------------
1636 // endian-reversing conversions
1637 // ----------------------------------------------------------------------------
1638
1639 size_t
1640 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1641                            const char *src, size_t srcLen) const
1642 {
1643     srcLen = GetLength(src, srcLen);
1644     if ( srcLen == wxNO_LEN )
1645         return wxCONV_FAILED;
1646
1647     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1648     const size_t inLen = srcLen / BYTES_PER_CHAR;
1649     size_t outLen = 0;
1650     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1651     {
1652         wxUint16 cc[2];
1653         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1654         if ( numChars == wxCONV_FAILED )
1655             return wxCONV_FAILED;
1656
1657         outLen += numChars;
1658         if ( dst )
1659         {
1660             if ( outLen > dstLen )
1661                 return wxCONV_FAILED;
1662
1663             *dst++ = cc[0];
1664             if ( numChars == 2 )
1665             {
1666                 // second character of a surrogate
1667                 *dst++ = cc[1];
1668             }
1669         }
1670     }
1671
1672     return outLen;
1673 }
1674
1675 size_t
1676 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1677                              const wchar_t *src, size_t srcLen) const
1678 {
1679     if ( srcLen == wxNO_LEN )
1680         srcLen = wxWcslen(src) + 1;
1681
1682     if ( !dst )
1683     {
1684         // optimization: return maximal space which could be needed for this
1685         // string instead of the exact amount which could be less if there are
1686         // any surrogates in the input
1687         //
1688         // we consider that surrogates are rare enough to make it worthwhile to
1689         // avoid running the loop below at the cost of slightly extra memory
1690         // consumption
1691         return srcLen*BYTES_PER_CHAR;
1692     }
1693
1694     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1695     size_t outLen = 0;
1696     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1697     {
1698         const wxUint32 ch = wxDecodeSurrogate(&src);
1699         if ( !src )
1700             return wxCONV_FAILED;
1701
1702         outLen += BYTES_PER_CHAR;
1703
1704         if ( outLen > dstLen )
1705             return wxCONV_FAILED;
1706
1707         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1708     }
1709
1710     return outLen;
1711 }
1712
1713 #else // !WC_UTF16: wchar_t is UTF-32
1714
1715 // ----------------------------------------------------------------------------
1716 // conversions without endianness change
1717 // ----------------------------------------------------------------------------
1718
1719 size_t
1720 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1721                                const char *src, size_t srcLen) const
1722 {
1723     // use memcpy() as it should be much faster than hand-written loop
1724     srcLen = GetLength(src, srcLen);
1725     if ( srcLen == wxNO_LEN )
1726         return wxCONV_FAILED;
1727
1728     const size_t inLen = srcLen/BYTES_PER_CHAR;
1729     if ( dst )
1730     {
1731         if ( dstLen < inLen )
1732             return wxCONV_FAILED;
1733
1734         memcpy(dst, src, srcLen);
1735     }
1736
1737     return inLen;
1738 }
1739
1740 size_t
1741 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1742                                  const wchar_t *src, size_t srcLen) const
1743 {
1744     if ( srcLen == wxNO_LEN )
1745         srcLen = wxWcslen(src) + 1;
1746
1747     srcLen *= BYTES_PER_CHAR;
1748
1749     if ( dst )
1750     {
1751         if ( dstLen < srcLen )
1752             return wxCONV_FAILED;
1753
1754         memcpy(dst, src, srcLen);
1755     }
1756
1757     return srcLen;
1758 }
1759
1760 // ----------------------------------------------------------------------------
1761 // endian-reversing conversions
1762 // ----------------------------------------------------------------------------
1763
1764 size_t
1765 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1766                            const char *src, size_t srcLen) const
1767 {
1768     srcLen = GetLength(src, srcLen);
1769     if ( srcLen == wxNO_LEN )
1770         return wxCONV_FAILED;
1771
1772     srcLen /= BYTES_PER_CHAR;
1773
1774     if ( dst )
1775     {
1776         if ( dstLen < srcLen )
1777             return wxCONV_FAILED;
1778
1779         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1780         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1781         {
1782             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1783         }
1784     }
1785
1786     return srcLen;
1787 }
1788
1789 size_t
1790 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1791                              const wchar_t *src, size_t srcLen) const
1792 {
1793     if ( srcLen == wxNO_LEN )
1794         srcLen = wxWcslen(src) + 1;
1795
1796     srcLen *= BYTES_PER_CHAR;
1797
1798     if ( dst )
1799     {
1800         if ( dstLen < srcLen )
1801             return wxCONV_FAILED;
1802
1803         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1804         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1805         {
1806             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1807         }
1808     }
1809
1810     return srcLen;
1811 }
1812
1813 #endif // WC_UTF16/!WC_UTF16
1814
1815
1816 // ============================================================================
1817 // The classes doing conversion using the iconv_xxx() functions
1818 // ============================================================================
1819
1820 #ifdef HAVE_ICONV
1821
1822 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1823 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1824 //     (unless there's yet another bug in glibc) the only case when iconv()
1825 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1826 //     left in the input buffer -- when _real_ error occurs,
1827 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1828 //     iconv() failure.
1829 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests the result cres of an iconv() call for
// failure; with glibc <= 2.1 an E2BIG error with bufLeft == 0 is not
// considered to be a failure
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the argument to "ICONV_CONST char **" (presumably the type expected
// for the input buffer parameter of iconv() -- ICONV_CONST is defined
// elsewhere)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value which m2w and w2m are compared against to detect that
// iconv_open() failed
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP is the macro swapping the bytes of a wchar_t and WC_ENC is the
// wxFontEncoding constant selected for wchar_t of this size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
1851
1852 // ----------------------------------------------------------------------------
1853 // wxMBConv_iconv: encapsulates an iconv character set
1854 // ----------------------------------------------------------------------------
1855
1856 class wxMBConv_iconv : public wxMBConv
1857 {
1858 public:
1859     wxMBConv_iconv(const char *name);
1860     virtual ~wxMBConv_iconv();
1861
1862     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1863     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1864
1865     // classify this encoding as explained in wxMBConv::GetMBNulLen() comment
1866     virtual size_t GetMBNulLen() const;
1867
1868 #if wxUSE_UNICODE_UTF8
1869     virtual bool IsUTF8() const;
1870 #endif
1871
1872     virtual wxMBConv *Clone() const
1873     {
1874         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
1875         p->m_minMBCharWidth = m_minMBCharWidth;
1876         return p;
1877     }
1878
1879     bool IsOk() const
1880         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
1881
1882 protected:
1883     // the iconv handlers used to translate from multibyte
1884     // to wide char and in the other direction
1885     iconv_t m2w,
1886             w2m;
1887
1888 #if wxUSE_THREADS
1889     // guards access to m2w and w2m objects
1890     wxMutex m_iconvMutex;
1891 #endif
1892
1893 private:
1894     // the name (for iconv_open()) of a wide char charset -- if none is
1895     // available on this machine, it will remain NULL
1896     static wxString ms_wcCharsetName;
1897
1898     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1899     // different endian-ness than the native one
1900     static bool ms_wcNeedsSwap;
1901
1902
1903     // name of the encoding handled by this conversion
1904     wxString m_name;
1905
1906     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
1907     // initially
1908     size_t m_minMBCharWidth;
1909 };
1910
1911 // make the constructor available for unit testing
1912 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
1913 {
1914     wxMBConv_iconv* result = new wxMBConv_iconv( name );
1915     if ( !result->IsOk() )
1916     {
1917         delete result;
1918         return 0;
1919     }
1920
1921     return result;
1922 }
1923
// definitions of the static members: the charset name is empty until the
// constructor has found a charset which can be used for wchar_t
wxString wxMBConv_iconv::ms_wcCharsetName;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1926
1927 wxMBConv_iconv::wxMBConv_iconv(const char *name)
1928               : m_name(name)
1929 {
1930     m_minMBCharWidth = 0;
1931
1932     // check for charset that represents wchar_t:
1933     if ( ms_wcCharsetName.empty() )
1934     {
1935         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
1936
1937 #if wxUSE_FONTMAP
1938         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
1939 #else // !wxUSE_FONTMAP
1940         static const wxChar *names_static[] =
1941         {
1942 #if SIZEOF_WCHAR_T == 4
1943             _T("UCS-4"),
1944 #elif SIZEOF_WCHAR_T = 2
1945             _T("UCS-2"),
1946 #endif
1947             NULL
1948         };
1949         const wxChar **names = names_static;
1950 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
1951
1952         for ( ; *names && ms_wcCharsetName.empty(); ++names )
1953         {
1954             const wxString nameCS(*names);
1955
1956             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
1957             wxString nameXE(nameCS);
1958
1959 #ifdef WORDS_BIGENDIAN
1960                 nameXE += _T("BE");
1961 #else // little endian
1962                 nameXE += _T("LE");
1963 #endif
1964
1965             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1966                        nameXE.c_str());
1967
1968             m2w = iconv_open(nameXE.ToAscii(), name);
1969             if ( m2w == ICONV_T_INVALID )
1970             {
1971                 // try charset w/o bytesex info (e.g. "UCS4")
1972                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
1973                            nameCS.c_str());
1974                 m2w = iconv_open(nameCS.ToAscii(), name);
1975
1976                 // and check for bytesex ourselves:
1977                 if ( m2w != ICONV_T_INVALID )
1978                 {
1979                     char    buf[2], *bufPtr;
1980                     wchar_t wbuf[2], *wbufPtr;
1981                     size_t  insz, outsz;
1982                     size_t  res;
1983
1984                     buf[0] = 'A';
1985                     buf[1] = 0;
1986                     wbuf[0] = 0;
1987                     insz = 2;
1988                     outsz = SIZEOF_WCHAR_T * 2;
1989                     wbufPtr = wbuf;
1990                     bufPtr = buf;
1991
1992                     res = iconv(
1993                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1994                         (char**)&wbufPtr, &outsz);
1995
1996                     if (ICONV_FAILED(res, insz))
1997                     {
1998                         wxLogLastError(wxT("iconv"));
1999                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2000                                    nameCS.c_str());
2001                     }
2002                     else // ok, can convert to this encoding, remember it
2003                     {
2004                         ms_wcCharsetName = nameCS;
2005                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2006                     }
2007                 }
2008             }
2009             else // use charset not requiring byte swapping
2010             {
2011                 ms_wcCharsetName = nameXE;
2012             }
2013         }
2014
2015         wxLogTrace(TRACE_STRCONV,
2016                    wxT("iconv wchar_t charset is \"%s\"%s"),
2017                    ms_wcCharsetName.empty() ? wxString("<none>")
2018                                             : ms_wcCharsetName,
2019                    ms_wcNeedsSwap ? _T(" (needs swap)")
2020                                   : _T(""));
2021     }
2022     else // we already have ms_wcCharsetName
2023     {
2024         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2025     }
2026
2027     if ( ms_wcCharsetName.empty() )
2028     {
2029         w2m = ICONV_T_INVALID;
2030     }
2031     else
2032     {
2033         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2034         if ( w2m == ICONV_T_INVALID )
2035         {
2036             wxLogTrace(TRACE_STRCONV,
2037                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2038                        ms_wcCharsetName.c_str(), name);
2039         }
2040     }
2041 }
2042
2043 wxMBConv_iconv::~wxMBConv_iconv()
2044 {
2045     if ( m2w != ICONV_T_INVALID )
2046         iconv_close(m2w);
2047     if ( w2m != ICONV_T_INVALID )
2048         iconv_close(w2m);
2049 }
2050
2051 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2052 {
2053     // find the string length: notice that must be done differently for
2054     // NUL-terminated strings and UTF-16/32 which are terminated with 2/4 NULs
2055     size_t inbuf;
2056     const size_t nulLen = GetMBNulLen();
2057     switch ( nulLen )
2058     {
2059         default:
2060             return wxCONV_FAILED;
2061
2062         case 1:
2063             inbuf = strlen(psz); // arguably more optimized than our version
2064             break;
2065
2066         case 2:
2067         case 4:
2068             // for UTF-16/32 not only we need to have 2/4 consecutive NULs but
2069             // they also have to start at character boundary and not span two
2070             // adjacent characters
2071             const char *p;
2072             for ( p = psz; NotAllNULs(p, nulLen); p += nulLen )
2073                 ;
2074             inbuf = p - psz;
2075             break;
2076     }
2077
2078 #if wxUSE_THREADS
2079     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2080     //     Unfortunately there are a couple of global wxCSConv objects such as
2081     //     wxConvLocal that are used all over wx code, so we have to make sure
2082     //     the handle is used by at most one thread at the time. Otherwise
2083     //     only a few wx classes would be safe to use from non-main threads
2084     //     as MB<->WC conversion would fail "randomly".
2085     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2086 #endif // wxUSE_THREADS
2087
2088     size_t outbuf = n * SIZEOF_WCHAR_T;
2089     size_t res, cres;
2090     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
2091     wchar_t *bufPtr = buf;
2092     const char *pszPtr = psz;
2093
2094     if (buf)
2095     {
2096         // have destination buffer, convert there
2097         cres = iconv(m2w,
2098                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
2099                      (char**)&bufPtr, &outbuf);
2100         res = n - (outbuf / SIZEOF_WCHAR_T);
2101
2102         if (ms_wcNeedsSwap)
2103         {
2104             // convert to native endianness
2105             for ( unsigned i = 0; i < res; i++ )
2106                 buf[n] = WC_BSWAP(buf[i]);
2107         }
2108
2109         // NUL-terminate the string if there is any space left
2110         if (res < n)
2111             buf[res] = 0;
2112     }
2113     else
2114     {
2115         // no destination buffer... convert using temp buffer
2116         // to calculate destination buffer requirement
2117         wchar_t tbuf[8];
2118         res = 0;
2119
2120         do
2121         {
2122             bufPtr = tbuf;
2123             outbuf = 8 * SIZEOF_WCHAR_T;
2124
2125             cres = iconv(m2w,
2126                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
2127                          (char**)&bufPtr, &outbuf );
2128
2129             res += 8 - (outbuf / SIZEOF_WCHAR_T);
2130         }
2131         while ((cres == (size_t)-1) && (errno == E2BIG));
2132     }
2133
2134     if (ICONV_FAILED(cres, inbuf))
2135     {
2136         //VS: it is ok if iconv fails, hence trace only
2137         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2138         return wxCONV_FAILED;
2139     }
2140
2141     return res;
2142 }
2143
2144 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2145 {
2146 #if wxUSE_THREADS
2147     // NB: explained in MB2WC
2148     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2149 #endif
2150
2151     size_t inlen = wxWcslen(psz);
2152     size_t inbuf = inlen * SIZEOF_WCHAR_T;
2153     size_t outbuf = n;
2154     size_t res, cres;
2155
2156     wchar_t *tmpbuf = 0;
2157
2158     if (ms_wcNeedsSwap)
2159     {
2160         // need to copy to temp buffer to switch endianness
2161         // (doing WC_BSWAP twice on the original buffer won't help, as it
2162         //  could be in read-only memory, or be accessed in some other thread)
2163         tmpbuf = (wchar_t *)malloc(inbuf + SIZEOF_WCHAR_T);
2164         for ( size_t i = 0; i < inlen; i++ )
2165             tmpbuf[n] = WC_BSWAP(psz[i]);
2166
2167         tmpbuf[inlen] = L'\0';
2168         psz = tmpbuf;
2169     }
2170
2171     if (buf)
2172     {
2173         // have destination buffer, convert there
2174         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2175
2176         res = n - outbuf;
2177
2178         // NB: iconv was given only wcslen(psz) characters on input, and so
2179         //     it couldn't convert the trailing zero. Let's do it ourselves
2180         //     if there's some room left for it in the output buffer.
2181         if (res < n)
2182             buf[0] = 0;
2183     }
2184     else
2185     {
2186         // no destination buffer: convert using temp buffer
2187         // to calculate destination buffer requirement
2188         char tbuf[16];
2189         res = 0;
2190         do
2191         {
2192             buf = tbuf;
2193             outbuf = 16;
2194
2195             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
2196
2197             res += 16 - outbuf;
2198         }
2199         while ((cres == (size_t)-1) && (errno == E2BIG));
2200     }
2201
2202     if (ms_wcNeedsSwap)
2203     {
2204         free(tmpbuf);
2205     }
2206
2207     if (ICONV_FAILED(cres, inbuf))
2208     {
2209         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2210         return wxCONV_FAILED;
2211     }
2212
2213     return res;
2214 }
2215
2216 size_t wxMBConv_iconv::GetMBNulLen() const
2217 {
2218     if ( m_minMBCharWidth == 0 )
2219     {
2220         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2221
2222 #if wxUSE_THREADS
2223         // NB: explained in MB2WC
2224         wxMutexLocker lock(self->m_iconvMutex);
2225 #endif
2226
2227         const wchar_t *wnul = L"";
2228         char buf[8]; // should be enough for NUL in any encoding
2229         size_t inLen = sizeof(wchar_t),
2230                outLen = WXSIZEOF(buf);
2231         char *inBuff = (char *)wnul;
2232         char *outBuff = buf;
2233         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2234         {
2235             self->m_minMBCharWidth = (size_t)-1;
2236         }
2237         else // ok
2238         {
2239             self->m_minMBCharWidth = outBuff - buf;
2240         }
2241     }
2242
2243     return m_minMBCharWidth;
2244 }
2245
2246 #if wxUSE_UNICODE_UTF8
2247 bool wxMBConv_iconv::IsUTF8() const
2248 {
2249     return wxStricmp(m_name, "UTF-8") == 0 ||
2250            wxStricmp(m_name, "UTF8") == 0;
2251 }
2252 #endif
2253
2254 #endif // HAVE_ICONV
2255
2256
2257 // ============================================================================
2258 // Win32 conversion classes
2259 // ============================================================================
2260
2261 #ifdef wxHAVE_WIN32_MB2WC
2262
2263 // from utils.cpp
2264 #if wxUSE_FONTMAP
2265 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2266 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2267 #endif
2268
2269 class wxMBConv_win32 : public wxMBConv
2270 {
2271 public:
2272     wxMBConv_win32()
2273     {
2274         m_CodePage = CP_ACP;
2275         m_minMBCharWidth = 0;
2276     }
2277
2278     wxMBConv_win32(const wxMBConv_win32& conv)
2279         : wxMBConv()
2280     {
2281         m_CodePage = conv.m_CodePage;
2282         m_minMBCharWidth = conv.m_minMBCharWidth;
2283     }
2284
2285 #if wxUSE_FONTMAP
2286     wxMBConv_win32(const char* name)
2287     {
2288         m_CodePage = wxCharsetToCodepage(name);
2289         m_minMBCharWidth = 0;
2290     }
2291
2292     wxMBConv_win32(wxFontEncoding encoding)
2293     {
2294         m_CodePage = wxEncodingToCodepage(encoding);
2295         m_minMBCharWidth = 0;
2296     }
2297 #endif // wxUSE_FONTMAP
2298
2299     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2300     {
2301         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2302         // the behaviour is not compatible with the Unix version (using iconv)
2303         // and break the library itself, e.g. wxTextInputStream::NextChar()
2304         // wouldn't work if reading an incomplete MB char didn't result in an
2305         // error
2306         //
2307         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2308         // Win XP or newer and it is not supported for UTF-[78] so we always
2309         // use our own conversions in this case. See
2310         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2311         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2312         if ( m_CodePage == CP_UTF8 )
2313         {
2314             return wxMBConvUTF8().MB2WC(buf, psz, n);
2315         }
2316
2317         if ( m_CodePage == CP_UTF7 )
2318         {
2319             return wxMBConvUTF7().MB2WC(buf, psz, n);
2320         }
2321
2322         int flags = 0;
2323         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2324                 IsAtLeastWin2kSP4() )
2325         {
2326             flags = MB_ERR_INVALID_CHARS;
2327         }
2328
2329         const size_t len = ::MultiByteToWideChar
2330                              (
2331                                 m_CodePage,     // code page
2332                                 flags,          // flags: fall on error
2333                                 psz,            // input string
2334                                 -1,             // its length (NUL-terminated)
2335                                 buf,            // output string
2336                                 buf ? n : 0     // size of output buffer
2337                              );
2338         if ( !len )
2339         {
2340             // function totally failed
2341             return wxCONV_FAILED;
2342         }
2343
2344         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2345         // check if we succeeded, by doing a double trip:
2346         if ( !flags && buf )
2347         {
2348             const size_t mbLen = strlen(psz);
2349             wxCharBuffer mbBuf(mbLen);
2350             if ( ::WideCharToMultiByte
2351                    (
2352                       m_CodePage,
2353                       0,
2354                       buf,
2355                       -1,
2356                       mbBuf.data(),
2357                       mbLen + 1,        // size in bytes, not length
2358                       NULL,
2359                       NULL
2360                    ) == 0 ||
2361                   strcmp(mbBuf, psz) != 0 )
2362             {
2363                 // we didn't obtain the same thing we started from, hence
2364                 // the conversion was lossy and we consider that it failed
2365                 return wxCONV_FAILED;
2366             }
2367         }
2368
2369         // note that it returns count of written chars for buf != NULL and size
2370         // of the needed buffer for buf == NULL so in either case the length of
2371         // the string (which never includes the terminating NUL) is one less
2372         return len - 1;
2373     }
2374
2375     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2376     {
2377         /*
2378             we have a problem here: by default, WideCharToMultiByte() may
2379             replace characters unrepresentable in the target code page with bad
2380             quality approximations such as turning "1/2" symbol (U+00BD) into
2381             "1" for the code pages which don't have it and we, obviously, want
2382             to avoid this at any price
2383
2384             the trouble is that this function does it _silently_, i.e. it won't
2385             even tell us whether it did or not... Win98/2000 and higher provide
2386             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2387             we have to resort to a round trip, i.e. check that converting back
2388             results in the same string -- this is, of course, expensive but
2389             otherwise we simply can't be sure to not garble the data.
2390          */
2391
2392         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2393         // it doesn't work with CJK encodings (which we test for rather roughly
2394         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2395         // supporting it
2396         BOOL usedDef wxDUMMY_INITIALIZE(false);
2397         BOOL *pUsedDef;
2398         int flags;
2399         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2400         {
2401             // it's our lucky day
2402             flags = WC_NO_BEST_FIT_CHARS;
2403             pUsedDef = &usedDef;
2404         }
2405         else // old system or unsupported encoding
2406         {
2407             flags = 0;
2408             pUsedDef = NULL;
2409         }
2410
2411         const size_t len = ::WideCharToMultiByte
2412                              (
2413                                 m_CodePage,     // code page
2414                                 flags,          // either none or no best fit
2415                                 pwz,            // input string
2416                                 -1,             // it is (wide) NUL-terminated
2417                                 buf,            // output buffer
2418                                 buf ? n : 0,    // and its size
2419                                 NULL,           // default "replacement" char
2420                                 pUsedDef        // [out] was it used?
2421                              );
2422
2423         if ( !len )
2424         {
2425             // function totally failed
2426             return wxCONV_FAILED;
2427         }
2428
2429         // if we were really converting, check if we succeeded
2430         if ( buf )
2431         {
2432             if ( flags )
2433             {
2434                 // check if the conversion failed, i.e. if any replacements
2435                 // were done
2436                 if ( usedDef )
2437                     return wxCONV_FAILED;
2438             }
2439             else // we must resort to double tripping...
2440             {
2441                 wxWCharBuffer wcBuf(n);
2442                 if ( MB2WC(wcBuf.data(), buf, n) == wxCONV_FAILED ||
2443                         wcscmp(wcBuf, pwz) != 0 )
2444                 {
2445                     // we didn't obtain the same thing we started from, hence
2446                     // the conversion was lossy and we consider that it failed
2447                     return wxCONV_FAILED;
2448                 }
2449             }
2450         }
2451
2452         // see the comment above for the reason of "len - 1"
2453         return len - 1;
2454     }
2455
2456     virtual size_t GetMBNulLen() const
2457     {
2458         if ( m_minMBCharWidth == 0 )
2459         {
2460             int len = ::WideCharToMultiByte
2461                         (
2462                             m_CodePage,     // code page
2463                             0,              // no flags
2464                             L"",            // input string
2465                             1,              // translate just the NUL
2466                             NULL,           // output buffer
2467                             0,              // and its size
2468                             NULL,           // no replacement char
2469                             NULL            // [out] don't care if it was used
2470                         );
2471
2472             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2473             switch ( len )
2474             {
2475                 default:
2476                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2477                     self->m_minMBCharWidth = (size_t)-1;
2478                     break;
2479
2480                 case 0:
2481                     self->m_minMBCharWidth = (size_t)-1;
2482                     break;
2483
2484                 case 1:
2485                 case 2:
2486                 case 4:
2487                     self->m_minMBCharWidth = len;
2488                     break;
2489             }
2490         }
2491
2492         return m_minMBCharWidth;
2493     }
2494
2495     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2496
2497     bool IsOk() const { return m_CodePage != -1; }
2498
2499 private:
2500     static bool CanUseNoBestFit()
2501     {
2502         static int s_isWin98Or2k = -1;
2503
2504         if ( s_isWin98Or2k == -1 )
2505         {
2506             int verMaj, verMin;
2507             switch ( wxGetOsVersion(&verMaj, &verMin) )
2508             {
2509                 case wxOS_WINDOWS_9X:
2510                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2511                     break;
2512
2513                 case wxOS_WINDOWS_NT:
2514                     s_isWin98Or2k = verMaj >= 5;
2515                     break;
2516
2517                 default:
2518                     // unknown: be conservative by default
2519                     s_isWin98Or2k = 0;
2520                     break;
2521             }
2522
2523             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2524         }
2525
2526         return s_isWin98Or2k == 1;
2527     }
2528
2529     static bool IsAtLeastWin2kSP4()
2530     {
2531 #ifdef __WXWINCE__
2532         return false;
2533 #else
2534         static int s_isAtLeastWin2kSP4 = -1;
2535
2536         if ( s_isAtLeastWin2kSP4 == -1 )
2537         {
2538             OSVERSIONINFOEX ver;
2539
2540             memset(&ver, 0, sizeof(ver));
2541             ver.dwOSVersionInfoSize = sizeof(ver);
2542             GetVersionEx((OSVERSIONINFO*)&ver);
2543
2544             s_isAtLeastWin2kSP4 =
2545               ((ver.dwMajorVersion > 5) || // Vista+
2546                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2547                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2548                ver.wServicePackMajor >= 4)) // 2000 SP4+
2549               ? 1 : 0;
2550         }
2551
2552         return s_isAtLeastWin2kSP4 == 1;
2553 #endif
2554     }
2555
2556
2557     // the code page we're working with
2558     long m_CodePage;
2559
2560     // cached result of GetMBNulLen(), set to 0 initially meaning
2561     // "unknown"
2562     size_t m_minMBCharWidth;
2563 };
2564
2565 #endif // wxHAVE_WIN32_MB2WC
2566
2567
2568 // ============================================================================
2569 // wxEncodingConverter based conversion classes
2570 // ============================================================================
2571
2572 #if wxUSE_FONTMAP
2573
2574 class wxMBConv_wxwin : public wxMBConv
2575 {
2576 private:
2577     void Init()
2578     {
2579         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2580         // The wxMBConv_cf class does a better job.
2581         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2582                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2583                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2584     }
2585
2586 public:
2587     // temporarily just use wxEncodingConverter stuff,
2588     // so that it works while a better implementation is built
2589     wxMBConv_wxwin(const char* name)
2590     {
2591         if (name)
2592             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2593         else
2594             m_enc = wxFONTENCODING_SYSTEM;
2595
2596         Init();
2597     }
2598
2599     wxMBConv_wxwin(wxFontEncoding enc)
2600     {
2601         m_enc = enc;
2602
2603         Init();
2604     }
2605
2606     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2607     {
2608         size_t inbuf = strlen(psz);
2609         if (buf)
2610         {
2611             if (!m2w.Convert(psz, buf))
2612                 return wxCONV_FAILED;
2613         }
2614         return inbuf;
2615     }
2616
2617     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2618     {
2619         const size_t inbuf = wxWcslen(psz);
2620         if (buf)
2621         {
2622             if (!w2m.Convert(psz, buf))
2623                 return wxCONV_FAILED;
2624         }
2625
2626         return inbuf;
2627     }
2628
2629     virtual size_t GetMBNulLen() const
2630     {
2631         switch ( m_enc )
2632         {
2633             case wxFONTENCODING_UTF16BE:
2634             case wxFONTENCODING_UTF16LE:
2635                 return 2;
2636
2637             case wxFONTENCODING_UTF32BE:
2638             case wxFONTENCODING_UTF32LE:
2639                 return 4;
2640
2641             default:
2642                 return 1;
2643         }
2644     }
2645
2646     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2647
2648     bool IsOk() const { return m_ok; }
2649
2650 public:
2651     wxFontEncoding m_enc;
2652     wxEncodingConverter m2w, w2m;
2653
2654 private:
2655     // were we initialized successfully?
2656     bool m_ok;
2657
2658     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2659 };
2660
2661 // make the constructors available for unit testing
2662 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2663 {
2664     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2665     if ( !result->IsOk() )
2666     {
2667         delete result;
2668         return 0;
2669     }
2670
2671     return result;
2672 }
2673
2674 #endif // wxUSE_FONTMAP
2675
2676 // ============================================================================
2677 // wxCSConv implementation
2678 // ============================================================================
2679
2680 void wxCSConv::Init()
2681 {
2682     m_name = NULL;
2683     m_convReal =  NULL;
2684     m_deferred = true;
2685 }
2686
2687 wxCSConv::wxCSConv(const wxString& charset)
2688 {
2689     Init();
2690
2691     if ( !charset.empty() )
2692     {
2693         SetName(charset.ToAscii());
2694     }
2695
2696 #if wxUSE_FONTMAP
2697     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2698 #else
2699     m_encoding = wxFONTENCODING_SYSTEM;
2700 #endif
2701 }
2702
2703 wxCSConv::wxCSConv(wxFontEncoding encoding)
2704 {
2705     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2706     {
2707         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2708
2709         encoding = wxFONTENCODING_SYSTEM;
2710     }
2711
2712     Init();
2713
2714     m_encoding = encoding;
2715 }
2716
2717 wxCSConv::~wxCSConv()
2718 {
2719     Clear();
2720 }
2721
2722 wxCSConv::wxCSConv(const wxCSConv& conv)
2723         : wxMBConv()
2724 {
2725     Init();
2726
2727     SetName(conv.m_name);
2728     m_encoding = conv.m_encoding;
2729 }
2730
2731 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2732 {
2733     Clear();
2734
2735     SetName(conv.m_name);
2736     m_encoding = conv.m_encoding;
2737
2738     return *this;
2739 }
2740
2741 void wxCSConv::Clear()
2742 {
2743     free(m_name);
2744     delete m_convReal;
2745
2746     m_name = NULL;
2747     m_convReal = NULL;
2748 }
2749
2750 void wxCSConv::SetName(const char *charset)
2751 {
2752     if (charset)
2753     {
2754         m_name = wxStrdup(charset);
2755         m_deferred = true;
2756     }
2757 }
2758
2759 #if wxUSE_FONTMAP
2760
2761 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
2762                      wxEncodingNameCache );
2763
2764 static wxEncodingNameCache gs_nameCache;
2765 #endif
2766
2767 wxMBConv *wxCSConv::DoCreate() const
2768 {
2769 #if wxUSE_FONTMAP
2770     wxLogTrace(TRACE_STRCONV,
2771                wxT("creating conversion for %s"),
2772                (m_name ? m_name
2773                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2774 #endif // wxUSE_FONTMAP
2775
2776     // check for the special case of ASCII or ISO8859-1 charset: as we have
2777     // special knowledge of it anyhow, we don't need to create a special
2778     // conversion object
2779     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2780             m_encoding == wxFONTENCODING_DEFAULT )
2781     {
2782         // don't convert at all
2783         return NULL;
2784     }
2785
2786     // we trust OS to do conversion better than we can so try external
2787     // conversion methods first
2788     //
2789     // the full order is:
2790     //      1. OS conversion (iconv() under Unix or Win32 API)
2791     //      2. hard coded conversions for UTF
2792     //      3. wxEncodingConverter as fall back
2793
2794     // step (1)
2795 #ifdef HAVE_ICONV
2796 #if !wxUSE_FONTMAP
2797     if ( m_name )
2798 #endif // !wxUSE_FONTMAP
2799     {
2800 #if wxUSE_FONTMAP
2801         wxFontEncoding encoding(m_encoding);
2802 #endif
2803
2804         if ( m_name )
2805         {
2806             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2807             if ( conv->IsOk() )
2808                 return conv;
2809
2810             delete conv;
2811
2812 #if wxUSE_FONTMAP
2813             encoding =
2814                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2815 #endif // wxUSE_FONTMAP
2816         }
2817 #if wxUSE_FONTMAP
2818         {
2819             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2820             if ( it != gs_nameCache.end() )
2821             {
2822                 if ( it->second.empty() )
2823                     return NULL;
2824
2825                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
2826                 if ( conv->IsOk() )
2827                     return conv;
2828
2829                 delete conv;
2830             }
2831
2832             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
2833             // CS : in case this does not return valid names (eg for MacRoman)
2834             // encoding got a 'failure' entry in the cache all the same,
2835             // although it just has to be created using a different method, so
2836             // only store failed iconv creation attempts (or perhaps we
2837             // shoulnd't do this at all ?)
2838             if ( names[0] != NULL )
2839             {
2840                 for ( ; *names; ++names )
2841                 {
2842                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
2843                     //             will need changes that will obsolete this
2844                     wxString name(*names);
2845                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
2846                     if ( conv->IsOk() )
2847                     {
2848                         gs_nameCache[encoding] = *names;
2849                         return conv;
2850                     }
2851
2852                     delete conv;
2853                 }
2854
2855                 gs_nameCache[encoding] = _T(""); // cache the failure
2856             }
2857         }
2858 #endif // wxUSE_FONTMAP
2859     }
2860 #endif // HAVE_ICONV
2861
2862 #ifdef wxHAVE_WIN32_MB2WC
2863     {
2864 #if wxUSE_FONTMAP
2865         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2866                                       : new wxMBConv_win32(m_encoding);
2867         if ( conv->IsOk() )
2868             return conv;
2869
2870         delete conv;
2871 #else
2872         return NULL;
2873 #endif
2874     }
2875 #endif // wxHAVE_WIN32_MB2WC
2876
2877 #ifdef __DARWIN__
2878     {
2879         // leave UTF16 and UTF32 to the built-ins of wx
2880         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2881             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2882         {
2883 #if wxUSE_FONTMAP
2884             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
2885                                           : new wxMBConv_cf(m_encoding);
2886 #else
2887             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
2888 #endif
2889
2890             if ( conv->IsOk() )
2891                  return conv;
2892
2893             delete conv;
2894         }
2895     }
2896 #endif // __DARWIN__
2897
2898     // step (2)
2899     wxFontEncoding enc = m_encoding;
2900 #if wxUSE_FONTMAP
2901     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2902     {
2903         // use "false" to suppress interactive dialogs -- we can be called from
2904         // anywhere and popping up a dialog from here is the last thing we want to
2905         // do
2906         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2907     }
2908 #endif // wxUSE_FONTMAP
2909
2910     switch ( enc )
2911     {
2912         case wxFONTENCODING_UTF7:
2913              return new wxMBConvUTF7;
2914
2915         case wxFONTENCODING_UTF8:
2916              return new wxMBConvUTF8;
2917
2918         case wxFONTENCODING_UTF16BE:
2919              return new wxMBConvUTF16BE;
2920
2921         case wxFONTENCODING_UTF16LE:
2922              return new wxMBConvUTF16LE;
2923
2924         case wxFONTENCODING_UTF32BE:
2925              return new wxMBConvUTF32BE;
2926
2927         case wxFONTENCODING_UTF32LE:
2928              return new wxMBConvUTF32LE;
2929
2930         default:
2931              // nothing to do but put here to suppress gcc warnings
2932              break;
2933     }
2934
2935     // step (3)
2936 #if wxUSE_FONTMAP
2937     {
2938         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2939                                       : new wxMBConv_wxwin(m_encoding);
2940         if ( conv->IsOk() )
2941             return conv;
2942
2943         delete conv;
2944     }
2945 #endif // wxUSE_FONTMAP
2946
2947     // NB: This is a hack to prevent deadlock. What could otherwise happen
2948     //     in Unicode build: wxConvLocal creation ends up being here
2949     //     because of some failure and logs the error. But wxLog will try to
2950     //     attach a timestamp, for which it will need wxConvLocal (to convert
2951     //     time to char* and then wchar_t*), but that fails, tries to log the
2952     //     error, but wxLog has an (already locked) critical section that
2953     //     guards the static buffer.
2954     static bool alreadyLoggingError = false;
2955     if (!alreadyLoggingError)
2956     {
2957         alreadyLoggingError = true;
2958         wxLogError(_("Cannot convert from the charset '%s'!"),
2959                    m_name ? m_name
2960                       :
2961 #if wxUSE_FONTMAP
2962                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
2963 #else // !wxUSE_FONTMAP
2964                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
2965 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2966               );
2967
2968         alreadyLoggingError = false;
2969     }
2970
2971     return NULL;
2972 }
2973
2974 void wxCSConv::CreateConvIfNeeded() const
2975 {
2976     if ( m_deferred )
2977     {
2978         wxCSConv *self = (wxCSConv *)this; // const_cast
2979
2980         // if we don't have neither the name nor the encoding, use the default
2981         // encoding for this system
2982         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2983         {
2984 #if wxUSE_INTL
2985             self->m_encoding = wxLocale::GetSystemEncoding();
2986 #else
2987             // fallback to some reasonable default:
2988             self->m_encoding = wxFONTENCODING_ISO8859_1;
2989 #endif // wxUSE_INTL
2990         }
2991
2992         self->m_convReal = DoCreate();
2993         self->m_deferred = false;
2994     }
2995 }
2996
2997 bool wxCSConv::IsOk() const
2998 {
2999     CreateConvIfNeeded();
3000
3001     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3002     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3003         return true; // always ok as we do it ourselves
3004
3005     // m_convReal->IsOk() is called at its own creation, so we know it must
3006     // be ok if m_convReal is non-NULL
3007     return m_convReal != NULL;
3008 }
3009
3010 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3011                          const char *src, size_t srcLen) const
3012 {
3013     CreateConvIfNeeded();
3014
3015     if (m_convReal)
3016         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3017
3018     // latin-1 (direct)
3019     return wxMBConv::ToWChar(dst, dstLen, src, srcLen);
3020 }
3021
3022 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3023                            const wchar_t *src, size_t srcLen) const
3024 {
3025     CreateConvIfNeeded();
3026
3027     if (m_convReal)
3028         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3029
3030     // latin-1 (direct)
3031     return wxMBConv::FromWChar(dst, dstLen, src, srcLen);
3032 }
3033
3034 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
3035 {
3036     CreateConvIfNeeded();
3037
3038     if (m_convReal)
3039         return m_convReal->MB2WC(buf, psz, n);
3040
3041     // latin-1 (direct)
3042     size_t len = strlen(psz);
3043
3044     if (buf)
3045     {
3046         for (size_t c = 0; c <= len; c++)
3047             buf[c] = (unsigned char)(psz[c]);
3048     }
3049
3050     return len;
3051 }
3052
3053 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
3054 {
3055     CreateConvIfNeeded();
3056
3057     if (m_convReal)
3058         return m_convReal->WC2MB(buf, psz, n);
3059
3060     // latin-1 (direct)
3061     const size_t len = wxWcslen(psz);
3062     if (buf)
3063     {
3064         for (size_t c = 0; c <= len; c++)
3065         {
3066             if (psz[c] > 0xFF)
3067                 return wxCONV_FAILED;
3068
3069             buf[c] = (char)psz[c];
3070         }
3071     }
3072     else
3073     {
3074         for (size_t c = 0; c <= len; c++)
3075         {
3076             if (psz[c] > 0xFF)
3077                 return wxCONV_FAILED;
3078         }
3079     }
3080
3081     return len;
3082 }
3083
3084 size_t wxCSConv::GetMBNulLen() const
3085 {
3086     CreateConvIfNeeded();
3087
3088     if ( m_convReal )
3089     {
3090         return m_convReal->GetMBNulLen();
3091     }
3092
3093     // otherwise, we are ISO-8859-1
3094     return 1;
3095 }
3096
3097 #if wxUSE_UNICODE_UTF8
3098 bool wxCSConv::IsUTF8() const
3099 {
3100     CreateConvIfNeeded();
3101
3102     if ( m_convReal )
3103     {
3104         return m_convReal->IsUTF8();
3105     }
3106
3107     // otherwise, we are ISO-8859-1
3108     return false;
3109 }
3110 #endif
3111
3112
3113 #if wxUSE_UNICODE
3114
3115 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3116 {
3117     if ( !s )
3118         return wxWCharBuffer();
3119
3120     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3121     if ( !wbuf )
3122         wbuf = wxMBConvUTF8().cMB2WX(s);
3123     if ( !wbuf )
3124         wbuf = wxConvISO8859_1.cMB2WX(s);
3125
3126     return wbuf;
3127 }
3128
3129 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3130 {
3131     if ( !ws )
3132         return wxCharBuffer();
3133
3134     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3135     if ( !buf )
3136         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3137
3138     return buf;
3139 }
3140
3141 #endif // wxUSE_UNICODE
3142
3143 // ----------------------------------------------------------------------------
3144 // globals
3145 // ----------------------------------------------------------------------------
3146
3147 // NB: The reason why we create converter objects in this convoluted way,
3148 //     using a factory function instead of a global variable, is that they
3149 //     may be used at static initialization time (some of them are used by
3150 //     wxString ctors and there may be a global wxString object). In other
3151 //     words, possibly _before_ the converter global object would be
3152 //     initialized.
3153
3154 #undef wxConvLibc
3155 #undef wxConvUTF8
3156 #undef wxConvUTF7
3157 #undef wxConvLocal
3158 #undef wxConvISO8859_1
3159
// Defines everything needed for the global converter called name:
//
//  - name##Ptr: a pointer variable declared with WXDLLIMPEXP_DATA_BASE and
//    initialized to NULL here
//  - wxGet_##name##Ptr(): a function returning the address of its local
//    static object of type impl_klass constructed with ctor_args
//  - gs_##name##instance: a static pointer initialized by calling the
//    function above
//
// klass is the type the pointers are declared with while impl_klass is the
// type of the object really created. The macro must be followed by a
// semicolon as the last definition is not terminated.
3160 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3161     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3162     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3163     {                                                                   \
3164         static impl_klass name##Obj ctor_args;                          \
3165         return &name##Obj;                                              \
3166     }                                                                   \
3167     /* this ensures that all global converter objects are created */    \
3168     /* by the time static initialization is done, i.e. before any */    \
3169     /* thread is launched: */                                           \
3170     static klass* gs_##name##instance = wxGet_##name##Ptr()
3171
// Shorthand for WX_DEFINE_GLOBAL_CONV2() for the case when the type of the
// pointers and the type of the object created are the same.
3172 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3173     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3174
// wxConvLibc is accessed as a wxMBConv but is really a wxMBConv_win32 when
// __WINDOWS__ is defined and a wxMBConvLibc otherwise
3175 #ifdef __WINDOWS__
3176     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3177 #else
3178     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3179 #endif
3180
// the UTF-8 and UTF-7 converters are default constructed
3181 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, wxEMPTY_PARAMETER_VALUE);
3182 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, wxEMPTY_PARAMETER_VALUE);
3183
// both of these are wxCSConv objects created for a fixed encoding
3184 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3185 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3186
// these pointers are initialized using the accessor functions defined by the
// macros above and not the objects themselves: wxConvCurrent initially
// points to wxConvLibc and wxConvUI to wxConvLocal
3187 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3188 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3189
3190 #ifdef __DARWIN__
3191 // The xnu kernel always communicates file paths in decomposed UTF-8.
3192 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3193 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3194 #endif
3195
// wxConvFileName initially points to the wxMBConv_cf object created for
// wxFONTENCODING_UTF8 above under Darwin and to wxConvLibc elsewhere
// (judging by its name, it is the converter meant to be used for file names)
3196 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3197 #ifdef __DARWIN__
3198                                     &wxConvMacUTF8DObj;
3199 #else // !__DARWIN__
3200                                     wxGet_wxConvLibcPtr();
3201 #endif // __DARWIN__/!__DARWIN__
3202
3203 #else // !wxUSE_WCHAR_T
3204
3205 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3206 // stand-ins in absence of wchar_t: in this case the global converters are
//                                  all defined as plain wxMBConv objects
3207 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3208                                 wxConvISO8859_1,
3209                                 wxConvLocal,
3210                                 wxConvUTF8;
3211
3212 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T