src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV wxT("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// the type of the code units consumed by wxDecodeSurrogate() below: this is
// wchar_t itself if WC_UTF16 is defined (i.e. wchar_t is 2 bytes) and wxUint16
// otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of ToWChar() in terms of the legacy MB2WC().
//
// dst may be NULL, in which case nothing is written and only the length is
// computed; otherwise dstLen is its size in wide characters. srcLen is the
// number of bytes of src to convert or wxNO_LEN if src is NUL-terminated.
//
// Returns the number of wide characters written (or which would be written)
// or wxCONV_FAILED on error. When srcLen is wxNO_LEN the returned value
// includes the trailing NUL; when an explicit length is given, only the NULs
// found inside the first srcLen bytes are counted.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        // NB: this is computed after src was possibly redirected to the copy
        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called with a fixed number of characters and when it's called for the
    // entire NUL-terminated string: in the former case (srcEnd != NULL) we
    // count only the NULs which are inside the given input range, i.e. a
    // terminator located exactly at srcEnd is not counted, while in the
    // latter (srcEnd == NULL) we convert a single chunk and always count its
    // trailing NUL
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        // the trailing NUL is accounted for immediately when converting the
        // entire string, for the fixed length case see the end of the loop
        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        // NOTE(review): an empty chunk ends the conversion right here, so
        // with an explicit srcLen any input following a leading NUL (or two
        // consecutive NULs) is not converted, and for an empty NUL-terminated
        // input nothing is stored in dst even though 1 is returned -- confirm
        // that this is intended
        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow (its trailing NUL was already counted above)
            break;
        }

        // advance the input pointer past the end of this chunk: notice that we
        // will always stop before srcEnd because we know that the chunk is
        // always properly NUL-terminated
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        // if the buffer ends before this NUL, we shouldn't count it in our
        // output so skip the code below
        if ( src == srcEnd )
            break;

        // do count this terminator as it's inside the buffer we convert
        dstWritten++;
        if ( dst )
            dst++;

        src += nulLen; // skip the terminator itself

        if ( src >= srcEnd )
            break;
    }

    return dstWritten;
}
 295
 296 size_t
 297 wxMBConv::FromWChar(char *dst, size_t dstLen,
 298                     const wchar_t *src, size_t srcLen) const
 299 {
 300     // the number of chars [which would be] written to dst [if it were not NULL]
 301     size_t dstWritten = 0;
 302
 303     // if we don't know its length we have no choice but to assume that it is
 304     // NUL-terminated (notice that it can still be NUL-terminated even if
 305     // explicit length is given but it doesn't change our return value)
 306     const bool isNulTerminated = srcLen == wxNO_LEN;
 307
 308     // make a copy of the input string unless it is already properly
 309     // NUL-terminated
 310     wxWCharBuffer bufTmp;
 311     if ( isNulTerminated )
 312     {
 313         srcLen = wxWcslen(src) + 1;
 314     }
 315     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 316     {
 317         // make a copy in order to properly NUL-terminate the string
 318         bufTmp = wxWCharBuffer(srcLen);
 319         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 320         src = bufTmp;
 321     }
 322
 323     const size_t lenNul = GetMBNulLen();
 324     for ( const wchar_t * const srcEnd = src + srcLen;
 325           src < srcEnd;
 326           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 327     {
 328         // try to convert the current chunk
 329         size_t lenChunk = WC2MB(NULL, src, 0);
 330
 331         if ( lenChunk == wxCONV_FAILED )
 332             return wxCONV_FAILED;
 333
 334         dstWritten += lenChunk;
 335         if ( src + lenChunk < srcEnd || isNulTerminated )
 336             dstWritten += lenNul;
 337
 338         if ( dst )
 339         {
 340             if ( dstWritten > dstLen )
 341                 return wxCONV_FAILED;
 342
 343             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 344                 return wxCONV_FAILED;
 345
 346             dst += lenChunk;
 347             if ( src + lenChunk < srcEnd || isNulTerminated )
 348                 dst += lenNul;
 349         }
 350     }
 351
 352     return dstWritten;
 353 }
 354
 355 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 356 {
 357     size_t rc = ToWChar(outBuff, outLen, inBuff);
 358     if ( rc != wxCONV_FAILED )
 359     {
 360         // ToWChar() returns the buffer length, i.e. including the trailing
 361         // NUL, while this method doesn't take it into account
 362         rc--;
 363     }
 364
 365     return rc;
 366 }
 367
 368 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 369 {
 370     size_t rc = FromWChar(outBuff, outLen, inBuff);
 371     if ( rc != wxCONV_FAILED )
 372     {
 373         rc -= GetMBNulLen();
 374     }
 375
 376     return rc;
 377 }
 378
// Out-of-line definition of the destructor, its body is intentionally empty.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 383
 384 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 385 {
 386     if ( psz )
 387     {
 388         // calculate the length of the buffer needed first
 389         const size_t nLen = ToWChar(NULL, 0, psz);
 390         if ( nLen != wxCONV_FAILED )
 391         {
 392             // now do the actual conversion
 393             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 394
 395             // +1 for the trailing NULL
 396             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 397                 return buf;
 398         }
 399     }
 400
 401     return wxWCharBuffer();
 402 }
 403
 404 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 405 {
 406     if ( pwz )
 407     {
 408         const size_t nLen = FromWChar(NULL, 0, pwz);
 409         if ( nLen != wxCONV_FAILED )
 410         {
 411             wxCharBuffer buf(nLen - 1);
 412             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 413                 return buf;
 414         }
 415     }
 416
 417     return wxCharBuffer();
 418 }
 419
 420 const wxWCharBuffer
 421 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 422 {
 423     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 424     if ( dstLen != wxCONV_FAILED )
 425     {
 426         // notice that we allocate space for dstLen+1 wide characters here
 427         // because we want the buffer to always be NUL-terminated, even if the
 428         // input isn't (as otherwise the caller has no way to know its length)
 429         wxWCharBuffer wbuf(dstLen);
 430         wbuf.data()[dstLen] = L'\0';
 431         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 432         {
 433             if ( outLen )
 434             {
 435                 *outLen = dstLen;
 436
 437                 // we also need to handle NUL-terminated input strings
 438                 // specially: for them the output is the length of the string
 439                 // excluding the trailing NUL, however if we're asked to
 440                 // convert a specific number of characters we return the length
 441                 // of the resulting output even if it's NUL-terminated
 442                 if ( inLen == wxNO_LEN )
 443                     (*outLen)--;
 444             }
 445
 446             return wbuf;
 447         }
 448     }
 449
 450     if ( outLen )
 451         *outLen = 0;
 452
 453     return wxWCharBuffer();
 454 }
 455
 456 const wxCharBuffer
 457 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 458 {
 459     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 460     if ( dstLen != wxCONV_FAILED )
 461     {
 462         const size_t nulLen = GetMBNulLen();
 463
 464         // as above, ensure that the buffer is always NUL-terminated, even if
 465         // the input is not
 466         wxCharBuffer buf(dstLen + nulLen - 1);
 467         memset(buf.data() + dstLen, 0, nulLen);
 468         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 469         {
 470             if ( outLen )
 471             {
 472                 *outLen = dstLen;
 473
 474                 if ( inLen == wxNO_LEN )
 475                 {
 476                     // in this case both input and output are NUL-terminated
 477                     // and we're not supposed to count NUL
 478                     *outLen -= nulLen;
 479                 }
 480             }
 481
 482             return buf;
 483         }
 484     }
 485
 486     if ( outLen )
 487         *outLen = 0;
 488
 489     return wxCharBuffer();
 490 }
 491
 492 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
 493 {
 494     const size_t srcLen = buf.length();
 495     if ( srcLen )
 496     {
 497         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
 498         if ( dstLen != wxCONV_FAILED )
 499         {
 500             wxWCharBuffer wbuf(dstLen);
 501             wbuf.data()[dstLen] = L'\0';
 502             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
 503                 return wbuf;
 504         }
 505     }
 506
 507     return wxWCharBuffer();
 508 }
 509
 510 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
 511 {
 512     const size_t srcLen = wbuf.length();
 513     if ( srcLen )
 514     {
 515         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
 516         if ( dstLen != wxCONV_FAILED )
 517         {
 518             wxCharBuffer buf(dstLen);
 519             buf.data()[dstLen] = '\0';
 520             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
 521                 return buf;
 522         }
 523     }
 524
 525     return wxCharBuffer();
 526 }
 527
 528 // ----------------------------------------------------------------------------
 529 // wxMBConvLibc
 530 // ----------------------------------------------------------------------------
 531
 532 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 533 {
 534     return wxMB2WC(buf, psz, n);
 535 }
 536
 537 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 538 {
 539     return wxWC2MB(buf, psz, n);
 540 }
 541
 542 // ----------------------------------------------------------------------------
 543 // wxConvBrokenFileNames
 544 // ----------------------------------------------------------------------------
 545
 546 #ifdef __UNIX__
 547
 548 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 549 {
 550     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
 551          wxStricmp(charset, wxT("UTF8")) == 0  )
 552         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 553     else
 554         m_conv = new wxCSConv(charset);
 555 }
 556
 557 #endif // __UNIX__
 558
 559 // ----------------------------------------------------------------------------
 560 // UTF-7
 561 // ----------------------------------------------------------------------------
 562
 563 // Implementation (C) 2004 Fredrik Roubert
 564 //
 565 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 566
 567 //
 568 // BASE64 decoding table
 569 //
// this table is indexed by the byte value and contains either the 6 bit value
// encoded by this base64 character ('A'..'Z' are 0x00..0x19, 'a'..'z' are
// 0x1a..0x33, '0'..'9' are 0x34..0x3d, '+' is 0x3e and '/' is 0x3f) or 0xff
// for all the bytes which are not valid base64 characters
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    // none of the bytes with the high bit set is a base64 character
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 605
 606 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 607                              const char *src, size_t srcLen) const
 608 {
 609     DecoderState stateOrig,
 610                 *statePtr;
 611     if ( srcLen == wxNO_LEN )
 612     {
 613         // convert the entire string, up to and including the trailing NUL
 614         srcLen = strlen(src) + 1;
 615
 616         // when working on the entire strings we don't update nor use the shift
 617         // state from the previous call
 618         statePtr = &stateOrig;
 619     }
 620     else // when working with partial strings we do use the shift state
 621     {
 622         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 623
 624         // also save the old state to be able to rollback to it on error
 625         stateOrig = m_stateDecoder;
 626     }
 627
 628     // but to simplify the code below we use this variable in both cases
 629     DecoderState& state = *statePtr;
 630
 631
 632     // number of characters [which would have been] written to dst [if it were
 633     // not NULL]
 634     size_t len = 0;
 635
 636     const char * const srcEnd = src + srcLen;
 637
 638     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 639     {
 640         const unsigned char cc = *src++;
 641
 642         if ( state.IsShifted() )
 643         {
 644             const unsigned char dc = utf7unb64[cc];
 645             if ( dc == 0xff )
 646             {
 647                 // end of encoded part, check that nothing was left: there can
 648                 // be up to 4 bits of 0 padding but nothing else (we also need
 649                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 650                 // encoded sequence must contain an integral number of UTF-16
 651                 // characters)
 652                 if ( state.isLSB || state.bit > 4 ||
 653                         (state.accum & ((1 << state.bit) - 1)) )
 654                 {
 655                     if ( !len )
 656                         state = stateOrig;
 657
 658                     return wxCONV_FAILED;
 659                 }
 660
 661                 state.ToDirect();
 662
 663                 // re-parse this character normally below unless it's '-' which
 664                 // is consumed by the decoder
 665                 if ( cc == '-' )
 666                     continue;
 667             }
 668             else // valid encoded character
 669             {
 670                 // mini base64 decoder: each character is 6 bits
 671                 state.bit += 6;
 672                 state.accum <<= 6;
 673                 state.accum += dc;
 674
 675                 if ( state.bit >= 8 )
 676                 {
 677                     // got the full byte, consume it
 678                     state.bit -= 8;
 679                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 680
 681                     if ( state.isLSB )
 682                     {
 683                         // we've got the full word, output it
 684                         if ( dst )
 685                             *dst++ = (state.msb << 8) | b;
 686                         len++;
 687                         state.isLSB = false;
 688                     }
 689                     else // MSB
 690                     {
 691                         // just store it while we wait for LSB
 692                         state.msb = b;
 693                         state.isLSB = true;
 694                     }
 695                 }
 696             }
 697         }
 698
 699         if ( state.IsDirect() )
 700         {
 701             // start of an encoded segment?
 702             if ( cc == '+' )
 703             {
 704                 if ( *src == '-' )
 705                 {
 706                     // just the encoded plus sign, don't switch to shifted mode
 707                     if ( dst )
 708                         *dst++ = '+';
 709                     len++;
 710                     src++;
 711                 }
 712                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 713                 {
 714                     // empty encoded chunks are not allowed
 715                     if ( !len )
 716                         state = stateOrig;
 717
 718                     return wxCONV_FAILED;
 719                 }
 720                 else // base-64 encoded chunk follows
 721                 {
 722                     state.ToShifted();
 723                 }
 724             }
 725             else // not '+'
 726             {
 727                 // only printable 7 bit ASCII characters (with the exception of
 728                 // NUL, TAB, CR and LF) can be used directly
 729                 if ( cc >= 0x7f || (cc < ' ' &&
 730                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 731                     return wxCONV_FAILED;
 732
 733                 if ( dst )
 734                     *dst++ = cc;
 735                 len++;
 736             }
 737         }
 738     }
 739
 740     if ( !len )
 741     {
 742         // as we didn't read any characters we should be called with the same
 743         // data (followed by some more new data) again later so don't save our
 744         // state
 745         state = stateOrig;
 746
 747         return wxCONV_FAILED;
 748     }
 749
 750     return len;
 751 }
 752
 753 //
 754 // BASE64 encoding table
 755 //
// this table is indexed by a 6 bit value (0..63) and contains the base64
// character used to represent it
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 767
 768 //
 769 // UTF-7 encoding table
 770 //
 771 // 0 - Set D (directly encoded characters)
 772 // 1 - Set O (optional direct characters)
 773 // 2 - whitespace characters (optional)
 774 // 3 - special characters
 775 //
 776 static const unsigned char utf7encode[128] =
 777 {
 778     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 779     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 780     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 781     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 782     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 783     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 784     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 785     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 786 };
 787
 788 static inline bool wxIsUTF7Direct(wchar_t wc)
 789 {
 790     return wc < 0x80 && utf7encode[wc] < 1;
 791 }
 792
// Encodes wide characters as UTF-7.
//
// dst may be NULL, in which case only the length of the result is computed.
// If srcLen is wxNO_LEN, the entire NUL-terminated string including its
// trailing NUL is converted using a local encoder state; otherwise exactly
// srcLen characters are processed starting from the state stored in
// m_stateEncoder, which is updated unless dst is NULL.
//
// Returns the number of bytes produced or wxCONV_FAILED if, when WC_UTF16 is
// not defined, a character greater than 0xffff is found at the start of an
// encoded run.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst [if it were not
    // NULL]
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;

    // NOTE(review): dstLen is only tested here, i.e. once per loop iteration,
    // while a single iteration can output several bytes -- presumably the
    // callers size dst using a previous call with NULL dst, confirm
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                // explicitly terminate the encoded part
                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign itself is represented by the "+-" sequence
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            if ( state.IsDirect() )
            {
                state.ToShifted();

                // start the encoded part
                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // each character is encoded as 2 bytes, high byte first
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output as many 6 bit groups as we have accumulated
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue encoding until the end of input or the next
                // character which can be output directly: it is not consumed
                // here but handled by the next iteration of the outer loop
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 905
 906 // ----------------------------------------------------------------------------
 907 // UTF-8
 908 // ----------------------------------------------------------------------------
 909
// the first 6 values are the greatest values of the form 2^k-1 for k = 7, 11,
// 16, 21, 26 and 31, presumably the maximal code points encodable using 1 to
// 6 bytes in UTF-8, so that the table would be indexed by the sequence length
// minus 1 -- TODO confirm, the code using it is not in this part of the file
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 912
// boundaries of the private use area we use to (temporarily) remap the bytes
// which are invalid in a UTF-8 encoded string: the range consists of 256
// values starting at wxUnicodePUA, wxUnicodePUAEnd itself being just beyond it
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 917
 918 // this table gives the length of the UTF-8 encoding from its first character:
 919 const unsigned char tableUtf8Lengths[256] = {
 920     // single-byte sequences (ASCII):
 921     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 922     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 923     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 924     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 925     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 926     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 927     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 928     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 929
 930     // these are invalid:
 931     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 932     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 933     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 934     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 935     0, 0,                                            // C0,C1
 936
 937     // two-byte sequences:
 938           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 939     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 940
 941     // three-byte sequences:
 942     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 943
 944     // four-byte sequences:
 945     4, 4, 4, 4, 4,                                   // F0..F4
 946
 947     // these are invalid again (5- or 6-byte
 948     // sequences and sequences for code points
 949     // above U+10FFFF, as restricted by RFC 3629):
 950                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 951 };
 952
 953 size_t
 954 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 955                             const char *src, size_t srcLen) const
 956 {
 957     wchar_t *out = dstLen ? dst : NULL;
 958     size_t written = 0;
 959
 960     if ( srcLen == wxNO_LEN )
 961         srcLen = strlen(src) + 1;
 962
 963     for ( const char *p = src; ; p++ )
 964     {
 965         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 966         {
 967             // all done successfully, just add the trailing NULL if we are not
 968             // using explicit length
 969             if ( srcLen == wxNO_LEN )
 970             {
 971                 if ( out )
 972                 {
 973                     if ( !dstLen )
 974                         break;
 975
 976                     *out = L'\0';
 977                 }
 978
 979                 written++;
 980             }
 981
 982             return written;
 983         }
 984
 985         if ( out && !dstLen-- )
 986             break;
 987
 988         wxUint32 code;
 989         unsigned char c = *p;
 990
 991         if ( c < 0x80 )
 992         {
 993             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 994                 break;
 995
 996             if ( srcLen != wxNO_LEN )
 997                 srcLen--;
 998
 999             code = c;
1000         }
1001         else
1002         {
1003             unsigned len = tableUtf8Lengths[c];
1004             if ( !len )
1005                 break;
1006
1007             if ( srcLen < len ) // the test works for wxNO_LEN too
1008                 break;
1009
1010             if ( srcLen != wxNO_LEN )
1011                 srcLen -= len;
1012
1013             //   Char. number range   |        UTF-8 octet sequence
1014             //      (hexadecimal)     |              (binary)
1015             //  ----------------------+----------------------------------------
1016             //  0000 0000 - 0000 007F | 0xxxxxxx
1017             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1018             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1019             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1020             //
1021             //  Code point value is stored in bits marked with 'x',
1022             //  lowest-order bit of the value on the right side in the diagram
1023             //  above.                                         (from RFC 3629)
1024
1025             // mask to extract lead byte's value ('x' bits above), by sequence
1026             // length:
1027             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1028
1029             // mask and value of lead byte's most significant bits, by length:
1030             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1031             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1032
1033             len--; // it's more convenient to work with 0-based length here
1034
1035             // extract the lead byte's value bits:
1036             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1037                 break;
1038
1039             code = c & leadValueMask[len];
1040
1041             // all remaining bytes, if any, are handled in the same way
1042             // regardless of sequence's length:
1043             for ( ; len; --len )
1044             {
1045                 c = *++p;
1046                 if ( (c & 0xC0) != 0x80 )
1047                     return wxCONV_FAILED;
1048
1049                 code <<= 6;
1050                 code |= c & 0x3F;
1051             }
1052         }
1053
1054 #ifdef WC_UTF16
1055         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1056         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1057         {
1058             if ( out )
1059                 out++;
1060             written++;
1061         }
1062 #else // !WC_UTF16
1063         if ( out )
1064             *out = code;
1065 #endif // WC_UTF16/!WC_UTF16
1066
1067         if ( out )
1068             out++;
1069
1070         written++;
1071     }
1072
1073     return wxCONV_FAILED;
1074 }
1075
1076 size_t
1077 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1078                               const wchar_t *src, size_t srcLen) const
1079 {
1080     char *out = dstLen ? dst : NULL;
1081     size_t written = 0;
1082
1083     for ( const wchar_t *wp = src; ; wp++ )
1084     {
1085         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1086         {
1087             // all done successfully, just add the trailing NULL if we are not
1088             // using explicit length
1089             if ( srcLen == wxNO_LEN )
1090             {
1091                 if ( out )
1092                 {
1093                     if ( !dstLen )
1094                         break;
1095
1096                     *out = '\0';
1097                 }
1098
1099                 written++;
1100             }
1101
1102             return written;
1103         }
1104
1105         if ( srcLen != wxNO_LEN )
1106             srcLen--;
1107
1108         wxUint32 code;
1109 #ifdef WC_UTF16
1110         // cast is ok for WC_UTF16
1111         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1112         {
1113             // skip the next char too as we decoded a surrogate
1114             wp++;
1115         }
1116 #else // wchar_t is UTF-32
1117         code = *wp & 0x7fffffff;
1118 #endif
1119
1120         unsigned len;
1121         if ( code <= 0x7F )
1122         {
1123             len = 1;
1124             if ( out )
1125             {
1126                 if ( dstLen < len )
1127                     break;
1128
1129                 out[0] = (char)code;
1130             }
1131         }
1132         else if ( code <= 0x07FF )
1133         {
1134             len = 2;
1135             if ( out )
1136             {
1137                 if ( dstLen < len )
1138                     break;
1139
1140                 // NB: this line takes 6 least significant bits, encodes them as
1141                 // 10xxxxxx and discards them so that the next byte can be encoded:
1142                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1143                 out[0] = 0xC0 | code;
1144             }
1145         }
1146         else if ( code < 0xFFFF )
1147         {
1148             len = 3;
1149             if ( out )
1150             {
1151                 if ( dstLen < len )
1152                     break;
1153
1154                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1155                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1156                 out[0] = 0xE0 | code;
1157             }
1158         }
1159         else if ( code <= 0x10FFFF )
1160         {
1161             len = 4;
1162             if ( out )
1163             {
1164                 if ( dstLen < len )
1165                     break;
1166
1167                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1168                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1169                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1170                 out[0] = 0xF0 | code;
1171             }
1172         }
1173         else
1174         {
1175             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1176             break;
1177         }
1178
1179         if ( out )
1180         {
1181             out += len;
1182             dstLen -= len;
1183         }
1184
1185         written += len;
1186     }
1187
1188     // we only get here if an error occurs during decoding
1189     return wxCONV_FAILED;
1190 }
1191
1192 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1193                              const char *psz, size_t srcLen) const
1194 {
1195     if ( m_options == MAP_INVALID_UTF8_NOT )
1196         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1197
1198     size_t len = 0;
1199
1200     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1201     {
1202         const char *opsz = psz;
1203         bool invalid = false;
1204         unsigned char cc = *psz++, fc = cc;
1205         unsigned cnt;
1206         for (cnt = 0; fc & 0x80; cnt++)
1207             fc <<= 1;
1208
1209         if (!cnt)
1210         {
1211             // plain ASCII char
1212             if (buf)
1213                 *buf++ = cc;
1214             len++;
1215
1216             // escape the escape character for octal escapes
1217             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1218                     && cc == '\\' && (!buf || len < n))
1219             {
1220                 if (buf)
1221                     *buf++ = cc;
1222                 len++;
1223             }
1224         }
1225         else
1226         {
1227             cnt--;
1228             if (!cnt)
1229             {
1230                 // invalid UTF-8 sequence
1231                 invalid = true;
1232             }
1233             else
1234             {
1235                 unsigned ocnt = cnt - 1;
1236                 wxUint32 res = cc & (0x3f >> cnt);
1237                 while (cnt--)
1238                 {
1239                     cc = *psz;
1240                     if ((cc & 0xC0) != 0x80)
1241                     {
1242                         // invalid UTF-8 sequence
1243                         invalid = true;
1244                         break;
1245                     }
1246
1247                     psz++;
1248                     res = (res << 6) | (cc & 0x3f);
1249                 }
1250
1251                 if (invalid || res <= utf8_max[ocnt])
1252                 {
1253                     // illegal UTF-8 encoding
1254                     invalid = true;
1255                 }
1256                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1257                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1258                 {
1259                     // if one of our PUA characters turns up externally
1260                     // it must also be treated as an illegal sequence
1261                     // (a bit like you have to escape an escape character)
1262                     invalid = true;
1263                 }
1264                 else
1265                 {
1266 #ifdef WC_UTF16
1267                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1268                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1269                     if (pa == wxCONV_FAILED)
1270                     {
1271                         invalid = true;
1272                     }
1273                     else
1274                     {
1275                         if (buf)
1276                             buf += pa;
1277                         len += pa;
1278                     }
1279 #else // !WC_UTF16
1280                     if (buf)
1281                         *buf++ = (wchar_t)res;
1282                     len++;
1283 #endif // WC_UTF16/!WC_UTF16
1284                 }
1285             }
1286
1287             if (invalid)
1288             {
1289                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1290                 {
1291                     while (opsz < psz && (!buf || len < n))
1292                     {
1293 #ifdef WC_UTF16
1294                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1295                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1296                         wxASSERT(pa != wxCONV_FAILED);
1297                         if (buf)
1298                             buf += pa;
1299                         opsz++;
1300                         len += pa;
1301 #else
1302                         if (buf)
1303                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1304                         opsz++;
1305                         len++;
1306 #endif
1307                     }
1308                 }
1309                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1310                 {
1311                     while (opsz < psz && (!buf || len < n))
1312                     {
1313                         if ( buf && len + 3 < n )
1314                         {
1315                             unsigned char on = *opsz;
1316                             *buf++ = L'\\';
1317                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1318                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1319                             *buf++ = (wchar_t)( L'0' + on % 010 );
1320                         }
1321
1322                         opsz++;
1323                         len += 4;
1324                     }
1325                 }
1326                 else // MAP_INVALID_UTF8_NOT
1327                 {
1328                     return wxCONV_FAILED;
1329                 }
1330             }
1331         }
1332     }
1333
1334     if (srcLen == wxNO_LEN && buf && (len < n))
1335         *buf = 0;
1336
1337     return len + 1;
1338 }
1339
// Return true if the given wide character is an octal digit, i.e. is in the
// range from '0' to '7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return !(wch > L'7');
}
1344
1345 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1346                                const wchar_t *psz, size_t srcLen) const
1347 {
1348     if ( m_options == MAP_INVALID_UTF8_NOT )
1349         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1350
1351     size_t len = 0;
1352
1353     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1354     {
1355         wxUint32 cc;
1356
1357 #ifdef WC_UTF16
1358         // cast is ok for WC_UTF16
1359         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1360         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1361 #else
1362         cc = (*psz++) & 0x7fffffff;
1363 #endif
1364
1365         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1366                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1367         {
1368             if (buf)
1369                 *buf++ = (char)(cc - wxUnicodePUA);
1370             len++;
1371         }
1372         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1373                     && cc == L'\\' && psz[0] == L'\\' )
1374         {
1375             if (buf)
1376                 *buf++ = (char)cc;
1377             psz++;
1378             len++;
1379         }
1380         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1381                     cc == L'\\' &&
1382                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1383         {
1384             if (buf)
1385             {
1386                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1387                                  (psz[1] - L'0') * 010 +
1388                                  (psz[2] - L'0'));
1389             }
1390
1391             psz += 3;
1392             len++;
1393         }
1394         else
1395         {
1396             unsigned cnt;
1397             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1398             {
1399             }
1400
1401             if (!cnt)
1402             {
1403                 // plain ASCII char
1404                 if (buf)
1405                     *buf++ = (char) cc;
1406                 len++;
1407             }
1408             else
1409             {
1410                 len += cnt + 1;
1411                 if (buf)
1412                 {
1413                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1414                     while (cnt--)
1415                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1416                 }
1417             }
1418         }
1419     }
1420
1421     if (srcLen == wxNO_LEN && buf && (len < n))
1422         *buf = 0;
1423
1424     return len + 1;
1425 }
1426
1427 // ============================================================================
1428 // UTF-16
1429 // ============================================================================
1430
// wxMBConvUTF16straight is the name of the class whose byte order is used
// without any changes and wxMBConvUTF16swap the name of the one requiring
// the bytes to be swapped, which is which depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1438
1439 /* static */
1440 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1441 {
1442     if ( srcLen == wxNO_LEN )
1443     {
1444         // count the number of bytes in input, including the trailing NULs
1445         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1446         for ( srcLen = 1; *inBuff++; srcLen++ )
1447             ;
1448
1449         srcLen *= BYTES_PER_CHAR;
1450     }
1451     else // we already have the length
1452     {
1453         // we can only convert an entire number of UTF-16 characters
1454         if ( srcLen % BYTES_PER_CHAR )
1455             return wxCONV_FAILED;
1456     }
1457
1458     return srcLen;
1459 }
1460
1461 // case when in-memory representation is UTF-16 too
1462 #ifdef WC_UTF16
1463
1464 // ----------------------------------------------------------------------------
1465 // conversions without endianness change
1466 // ----------------------------------------------------------------------------
1467
1468 size_t
1469 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1470                                const char *src, size_t srcLen) const
1471 {
1472     // set up the scene for using memcpy() (which is presumably more efficient
1473     // than copying the bytes one by one)
1474     srcLen = GetLength(src, srcLen);
1475     if ( srcLen == wxNO_LEN )
1476         return wxCONV_FAILED;
1477
1478     const size_t inLen = srcLen / BYTES_PER_CHAR;
1479     if ( dst )
1480     {
1481         if ( dstLen < inLen )
1482             return wxCONV_FAILED;
1483
1484         memcpy(dst, src, srcLen);
1485     }
1486
1487     return inLen;
1488 }
1489
1490 size_t
1491 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1492                                  const wchar_t *src, size_t srcLen) const
1493 {
1494     if ( srcLen == wxNO_LEN )
1495         srcLen = wxWcslen(src) + 1;
1496
1497     srcLen *= BYTES_PER_CHAR;
1498
1499     if ( dst )
1500     {
1501         if ( dstLen < srcLen )
1502             return wxCONV_FAILED;
1503
1504         memcpy(dst, src, srcLen);
1505     }
1506
1507     return srcLen;
1508 }
1509
1510 // ----------------------------------------------------------------------------
1511 // endian-reversing conversions
1512 // ----------------------------------------------------------------------------
1513
1514 size_t
1515 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1516                            const char *src, size_t srcLen) const
1517 {
1518     srcLen = GetLength(src, srcLen);
1519     if ( srcLen == wxNO_LEN )
1520         return wxCONV_FAILED;
1521
1522     srcLen /= BYTES_PER_CHAR;
1523
1524     if ( dst )
1525     {
1526         if ( dstLen < srcLen )
1527             return wxCONV_FAILED;
1528
1529         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1530         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1531         {
1532             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1533         }
1534     }
1535
1536     return srcLen;
1537 }
1538
1539 size_t
1540 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1541                              const wchar_t *src, size_t srcLen) const
1542 {
1543     if ( srcLen == wxNO_LEN )
1544         srcLen = wxWcslen(src) + 1;
1545
1546     srcLen *= BYTES_PER_CHAR;
1547
1548     if ( dst )
1549     {
1550         if ( dstLen < srcLen )
1551             return wxCONV_FAILED;
1552
1553         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1554         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1555         {
1556             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1557         }
1558     }
1559
1560     return srcLen;
1561 }
1562
1563 #else // !WC_UTF16: wchar_t is UTF-32
1564
1565 // ----------------------------------------------------------------------------
1566 // conversions without endianness change
1567 // ----------------------------------------------------------------------------
1568
1569 size_t
1570 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1571                                const char *src, size_t srcLen) const
1572 {
1573     srcLen = GetLength(src, srcLen);
1574     if ( srcLen == wxNO_LEN )
1575         return wxCONV_FAILED;
1576
1577     const size_t inLen = srcLen / BYTES_PER_CHAR;
1578     if ( !dst )
1579     {
1580         // optimization: return maximal space which could be needed for this
1581         // string even if the real size could be smaller if the buffer contains
1582         // any surrogates
1583         return inLen;
1584     }
1585
1586     size_t outLen = 0;
1587     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1588     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1589     {
1590         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1591         if ( !inBuff )
1592             return wxCONV_FAILED;
1593
1594         if ( ++outLen > dstLen )
1595             return wxCONV_FAILED;
1596
1597         *dst++ = ch;
1598     }
1599
1600
1601     return outLen;
1602 }
1603
1604 size_t
1605 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1606                                  const wchar_t *src, size_t srcLen) const
1607 {
1608     if ( srcLen == wxNO_LEN )
1609         srcLen = wxWcslen(src) + 1;
1610
1611     size_t outLen = 0;
1612     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1613     for ( size_t n = 0; n < srcLen; n++ )
1614     {
1615         wxUint16 cc[2];
1616         const size_t numChars = encode_utf16(*src++, cc);
1617         if ( numChars == wxCONV_FAILED )
1618             return wxCONV_FAILED;
1619
1620         outLen += numChars * BYTES_PER_CHAR;
1621         if ( outBuff )
1622         {
1623             if ( outLen > dstLen )
1624                 return wxCONV_FAILED;
1625
1626             *outBuff++ = cc[0];
1627             if ( numChars == 2 )
1628             {
1629                 // second character of a surrogate
1630                 *outBuff++ = cc[1];
1631             }
1632         }
1633     }
1634
1635     return outLen;
1636 }
1637
1638 // ----------------------------------------------------------------------------
1639 // endian-reversing conversions
1640 // ----------------------------------------------------------------------------
1641
1642 size_t
1643 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1644                            const char *src, size_t srcLen) const
1645 {
1646     srcLen = GetLength(src, srcLen);
1647     if ( srcLen == wxNO_LEN )
1648         return wxCONV_FAILED;
1649
1650     const size_t inLen = srcLen / BYTES_PER_CHAR;
1651     if ( !dst )
1652     {
1653         // optimization: return maximal space which could be needed for this
1654         // string even if the real size could be smaller if the buffer contains
1655         // any surrogates
1656         return inLen;
1657     }
1658
1659     size_t outLen = 0;
1660     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1661     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1662     {
1663         wxUint32 ch;
1664         wxUint16 tmp[2];
1665
1666         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1667         inBuff++;
1668         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1669
1670         const size_t numChars = decode_utf16(tmp, ch);
1671         if ( numChars == wxCONV_FAILED )
1672             return wxCONV_FAILED;
1673
1674         if ( numChars == 2 )
1675             inBuff++;
1676
1677         if ( ++outLen > dstLen )
1678             return wxCONV_FAILED;
1679
1680         *dst++ = ch;
1681     }
1682
1683
1684     return outLen;
1685 }
1686
1687 size_t
1688 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1689                              const wchar_t *src, size_t srcLen) const
1690 {
1691     if ( srcLen == wxNO_LEN )
1692         srcLen = wxWcslen(src) + 1;
1693
1694     size_t outLen = 0;
1695     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1696     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1697     {
1698         wxUint16 cc[2];
1699         const size_t numChars = encode_utf16(*src, cc);
1700         if ( numChars == wxCONV_FAILED )
1701             return wxCONV_FAILED;
1702
1703         outLen += numChars * BYTES_PER_CHAR;
1704         if ( outBuff )
1705         {
1706             if ( outLen > dstLen )
1707                 return wxCONV_FAILED;
1708
1709             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1710             if ( numChars == 2 )
1711             {
1712                 // second character of a surrogate
1713                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1714             }
1715         }
1716     }
1717
1718     return outLen;
1719 }
1720
1721 #endif // WC_UTF16/!WC_UTF16
1722
1723
1724 // ============================================================================
1725 // UTF-32
1726 // ============================================================================
1727
// wxMBConvUTF32straight is the name of the class whose byte order is used
// without any changes and wxMBConvUTF32swap the name of the one requiring
// the bytes to be swapped, which is which depends on WORDS_BIGENDIAN
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1735
1736
// global objects of the little and big endian UTF-32 converter classes
// (NOTE(review): WXDLLIMPEXP_DATA_BASE is defined elsewhere, presumably it
// just adds the DLL import/export decoration to the type -- to be confirmed)
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1739
1740 /* static */
1741 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1742 {
1743     if ( srcLen == wxNO_LEN )
1744     {
1745         // count the number of bytes in input, including the trailing NULs
1746         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1747         for ( srcLen = 1; *inBuff++; srcLen++ )
1748             ;
1749
1750         srcLen *= BYTES_PER_CHAR;
1751     }
1752     else // we already have the length
1753     {
1754         // we can only convert an entire number of UTF-32 characters
1755         if ( srcLen % BYTES_PER_CHAR )
1756             return wxCONV_FAILED;
1757     }
1758
1759     return srcLen;
1760 }
1761
1762 // case when in-memory representation is UTF-16
1763 #ifdef WC_UTF16
1764
1765 // ----------------------------------------------------------------------------
1766 // conversions without endianness change
1767 // ----------------------------------------------------------------------------
1768
1769 size_t
1770 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1771                                const char *src, size_t srcLen) const
1772 {
1773     srcLen = GetLength(src, srcLen);
1774     if ( srcLen == wxNO_LEN )
1775         return wxCONV_FAILED;
1776
1777     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1778     const size_t inLen = srcLen / BYTES_PER_CHAR;
1779     size_t outLen = 0;
1780     for ( size_t n = 0; n < inLen; n++ )
1781     {
1782         wxUint16 cc[2];
1783         const size_t numChars = encode_utf16(*inBuff++, cc);
1784         if ( numChars == wxCONV_FAILED )
1785             return wxCONV_FAILED;
1786
1787         outLen += numChars;
1788         if ( dst )
1789         {
1790             if ( outLen > dstLen )
1791                 return wxCONV_FAILED;
1792
1793             *dst++ = cc[0];
1794             if ( numChars == 2 )
1795             {
1796                 // second character of a surrogate
1797                 *dst++ = cc[1];
1798             }
1799         }
1800     }
1801
1802     return outLen;
1803 }
1804
1805 size_t
1806 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1807                                  const wchar_t *src, size_t srcLen) const
1808 {
1809     if ( srcLen == wxNO_LEN )
1810         srcLen = wxWcslen(src) + 1;
1811
1812     if ( !dst )
1813     {
1814         // optimization: return maximal space which could be needed for this
1815         // string instead of the exact amount which could be less if there are
1816         // any surrogates in the input
1817         //
1818         // we consider that surrogates are rare enough to make it worthwhile to
1819         // avoid running the loop below at the cost of slightly extra memory
1820         // consumption
1821         return srcLen * BYTES_PER_CHAR;
1822     }
1823
1824     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1825     size_t outLen = 0;
1826     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1827     {
1828         const wxUint32 ch = wxDecodeSurrogate(&src);
1829         if ( !src )
1830             return wxCONV_FAILED;
1831
1832         outLen += BYTES_PER_CHAR;
1833
1834         if ( outLen > dstLen )
1835             return wxCONV_FAILED;
1836
1837         *outBuff++ = ch;
1838     }
1839
1840     return outLen;
1841 }
1842
1843 // ----------------------------------------------------------------------------
1844 // endian-reversing conversions
1845 // ----------------------------------------------------------------------------
1846
1847 size_t
1848 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1849                            const char *src, size_t srcLen) const
1850 {
1851     srcLen = GetLength(src, srcLen);
1852     if ( srcLen == wxNO_LEN )
1853         return wxCONV_FAILED;
1854
1855     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1856     const size_t inLen = srcLen / BYTES_PER_CHAR;
1857     size_t outLen = 0;
1858     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1859     {
1860         wxUint16 cc[2];
1861         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1862         if ( numChars == wxCONV_FAILED )
1863             return wxCONV_FAILED;
1864
1865         outLen += numChars;
1866         if ( dst )
1867         {
1868             if ( outLen > dstLen )
1869                 return wxCONV_FAILED;
1870
1871             *dst++ = cc[0];
1872             if ( numChars == 2 )
1873             {
1874                 // second character of a surrogate
1875                 *dst++ = cc[1];
1876             }
1877         }
1878     }
1879
1880     return outLen;
1881 }
1882
1883 size_t
1884 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1885                              const wchar_t *src, size_t srcLen) const
1886 {
1887     if ( srcLen == wxNO_LEN )
1888         srcLen = wxWcslen(src) + 1;
1889
1890     if ( !dst )
1891     {
1892         // optimization: return maximal space which could be needed for this
1893         // string instead of the exact amount which could be less if there are
1894         // any surrogates in the input
1895         //
1896         // we consider that surrogates are rare enough to make it worthwhile to
1897         // avoid running the loop below at the cost of slightly extra memory
1898         // consumption
1899         return srcLen*BYTES_PER_CHAR;
1900     }
1901
1902     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1903     size_t outLen = 0;
1904     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1905     {
1906         const wxUint32 ch = wxDecodeSurrogate(&src);
1907         if ( !src )
1908             return wxCONV_FAILED;
1909
1910         outLen += BYTES_PER_CHAR;
1911
1912         if ( outLen > dstLen )
1913             return wxCONV_FAILED;
1914
1915         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1916     }
1917
1918     return outLen;
1919 }
1920
1921 #else // !WC_UTF16: wchar_t is UTF-32
1922
1923 // ----------------------------------------------------------------------------
1924 // conversions without endianness change
1925 // ----------------------------------------------------------------------------
1926
1927 size_t
1928 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1929                                const char *src, size_t srcLen) const
1930 {
1931     // use memcpy() as it should be much faster than hand-written loop
1932     srcLen = GetLength(src, srcLen);
1933     if ( srcLen == wxNO_LEN )
1934         return wxCONV_FAILED;
1935
1936     const size_t inLen = srcLen/BYTES_PER_CHAR;
1937     if ( dst )
1938     {
1939         if ( dstLen < inLen )
1940             return wxCONV_FAILED;
1941
1942         memcpy(dst, src, srcLen);
1943     }
1944
1945     return inLen;
1946 }
1947
1948 size_t
1949 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1950                                  const wchar_t *src, size_t srcLen) const
1951 {
1952     if ( srcLen == wxNO_LEN )
1953         srcLen = wxWcslen(src) + 1;
1954
1955     srcLen *= BYTES_PER_CHAR;
1956
1957     if ( dst )
1958     {
1959         if ( dstLen < srcLen )
1960             return wxCONV_FAILED;
1961
1962         memcpy(dst, src, srcLen);
1963     }
1964
1965     return srcLen;
1966 }
1967
1968 // ----------------------------------------------------------------------------
1969 // endian-reversing conversions
1970 // ----------------------------------------------------------------------------
1971
1972 size_t
1973 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1974                            const char *src, size_t srcLen) const
1975 {
1976     srcLen = GetLength(src, srcLen);
1977     if ( srcLen == wxNO_LEN )
1978         return wxCONV_FAILED;
1979
1980     srcLen /= BYTES_PER_CHAR;
1981
1982     if ( dst )
1983     {
1984         if ( dstLen < srcLen )
1985             return wxCONV_FAILED;
1986
1987         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1988         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1989         {
1990             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1991         }
1992     }
1993
1994     return srcLen;
1995 }
1996
1997 size_t
1998 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1999                              const wchar_t *src, size_t srcLen) const
2000 {
2001     if ( srcLen == wxNO_LEN )
2002         srcLen = wxWcslen(src) + 1;
2003
2004     srcLen *= BYTES_PER_CHAR;
2005
2006     if ( dst )
2007     {
2008         if ( dstLen < srcLen )
2009             return wxCONV_FAILED;
2010
2011         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2012         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2013         {
2014             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2015         }
2016     }
2017
2018     return srcLen;
2019 }
2020
2021 #endif // WC_UTF16/!WC_UTF16
2022
2023
2024 // ============================================================================
2025 // The classes doing conversion using the iconv_xxx() functions
2026 // ============================================================================
2027
2028 #ifdef HAVE_ICONV
2029
2030 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
2031 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
2032 //     (unless there's yet another bug in glibc) the only case when iconv()
2033 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
2034 //     left in the input buffer -- when _real_ error occurs,
2035 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
2036 //     iconv() failure.
2037 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft) tests the value returned by iconv() (cres) for
// failure; bufLeft is the number of input bytes left unconverted and is only
// taken into account for the glibc versions affected by the bug described
// above.
//
// The macro arguments are parenthesized so that passing an expression
// instead of a plain variable can't change the meaning of the test.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type which iconv() is declared
// to take for it
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value of an iconv_t which doesn't refer to an opened conversion descriptor
#define ICONV_T_INVALID ((iconv_t)-1)
2048
2049 #if SIZEOF_WCHAR_T == 4
2050     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2051     #define WC_ENC      wxFONTENCODING_UTF32
2052 #elif SIZEOF_WCHAR_T == 2
2053     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2054     #define WC_ENC      wxFONTENCODING_UTF16
2055 #else // sizeof(wchar_t) != 2 nor 4
2056     // does this ever happen?
2057     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2058 #endif
2059
2060 // ----------------------------------------------------------------------------
2061 // wxMBConv_iconv: encapsulates an iconv character set
2062 // ----------------------------------------------------------------------------
2063
2064 class wxMBConv_iconv : public wxMBConv
2065 {
2066 public:
2067     wxMBConv_iconv(const char *name);
2068     virtual ~wxMBConv_iconv();
2069
2070     // implement base class virtual methods
2071     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2072                            const char *src, size_t srcLen = wxNO_LEN) const;
2073     virtual size_t FromWChar(char *dst, size_t dstLen,
2074                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2075     virtual size_t GetMBNulLen() const;
2076
2077 #if wxUSE_UNICODE_UTF8
2078     virtual bool IsUTF8() const;
2079 #endif
2080
2081     virtual wxMBConv *Clone() const
2082     {
2083         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2084         p->m_minMBCharWidth = m_minMBCharWidth;
2085         return p;
2086     }
2087
2088     bool IsOk() const
2089         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2090
2091 protected:
2092     // the iconv handlers used to translate from multibyte
2093     // to wide char and in the other direction
2094     iconv_t m2w,
2095             w2m;
2096
2097 #if wxUSE_THREADS
2098     // guards access to m2w and w2m objects
2099     wxMutex m_iconvMutex;
2100 #endif
2101
2102 private:
2103     // the name (for iconv_open()) of a wide char charset -- if none is
2104     // available on this machine, it will remain NULL
2105     static wxString ms_wcCharsetName;
2106
2107     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2108     // different endian-ness than the native one
2109     static bool ms_wcNeedsSwap;
2110
2111
2112     // name of the encoding handled by this conversion
2113     wxString m_name;
2114
2115     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2116     // initially
2117     size_t m_minMBCharWidth;
2118 };
2119
2120 // make the constructor available for unit testing
2121 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2122 {
2123     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2124     if ( !result->IsOk() )
2125     {
2126         delete result;
2127         return 0;
2128     }
2129
2130     return result;
2131 }
2132
2133 wxString wxMBConv_iconv::ms_wcCharsetName;
2134 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2135
2136 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2137               : m_name(name)
2138 {
2139     m_minMBCharWidth = 0;
2140
2141     // check for charset that represents wchar_t:
2142     if ( ms_wcCharsetName.empty() )
2143     {
2144         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2145
2146 #if wxUSE_FONTMAP
2147         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2148 #else // !wxUSE_FONTMAP
2149         static const wxChar *names_static[] =
2150         {
2151 #if SIZEOF_WCHAR_T == 4
2152             wxT("UCS-4"),
2153 #elif SIZEOF_WCHAR_T = 2
2154             wxT("UCS-2"),
2155 #endif
2156             NULL
2157         };
2158         const wxChar **names = names_static;
2159 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2160
2161         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2162         {
2163             const wxString nameCS(*names);
2164
2165             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2166             wxString nameXE(nameCS);
2167
2168 #ifdef WORDS_BIGENDIAN
2169                 nameXE += wxT("BE");
2170 #else // little endian
2171                 nameXE += wxT("LE");
2172 #endif
2173
2174             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2175                        nameXE.c_str());
2176
2177             m2w = iconv_open(nameXE.ToAscii(), name);
2178             if ( m2w == ICONV_T_INVALID )
2179             {
2180                 // try charset w/o bytesex info (e.g. "UCS4")
2181                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2182                            nameCS.c_str());
2183                 m2w = iconv_open(nameCS.ToAscii(), name);
2184
2185                 // and check for bytesex ourselves:
2186                 if ( m2w != ICONV_T_INVALID )
2187                 {
2188                     char    buf[2], *bufPtr;
2189                     wchar_t wbuf[2];
2190                     size_t  insz, outsz;
2191                     size_t  res;
2192
2193                     buf[0] = 'A';
2194                     buf[1] = 0;
2195                     wbuf[0] = 0;
2196                     insz = 2;
2197                     outsz = SIZEOF_WCHAR_T * 2;
2198                     char* wbufPtr = (char*)wbuf;
2199                     bufPtr = buf;
2200
2201                     res = iconv(
2202                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2203                         &wbufPtr, &outsz);
2204
2205                     if (ICONV_FAILED(res, insz))
2206                     {
2207                         wxLogLastError(wxT("iconv"));
2208                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2209                                    nameCS.c_str());
2210                     }
2211                     else // ok, can convert to this encoding, remember it
2212                     {
2213                         ms_wcCharsetName = nameCS;
2214                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2215                     }
2216                 }
2217             }
2218             else // use charset not requiring byte swapping
2219             {
2220                 ms_wcCharsetName = nameXE;
2221             }
2222         }
2223
2224         wxLogTrace(TRACE_STRCONV,
2225                    wxT("iconv wchar_t charset is \"%s\"%s"),
2226                    ms_wcCharsetName.empty() ? wxString("<none>")
2227                                             : ms_wcCharsetName,
2228                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2229                                   : wxT(""));
2230     }
2231     else // we already have ms_wcCharsetName
2232     {
2233         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2234     }
2235
2236     if ( ms_wcCharsetName.empty() )
2237     {
2238         w2m = ICONV_T_INVALID;
2239     }
2240     else
2241     {
2242         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2243         if ( w2m == ICONV_T_INVALID )
2244         {
2245             wxLogTrace(TRACE_STRCONV,
2246                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2247                        ms_wcCharsetName.c_str(), name);
2248         }
2249     }
2250 }
2251
2252 wxMBConv_iconv::~wxMBConv_iconv()
2253 {
2254     if ( m2w != ICONV_T_INVALID )
2255         iconv_close(m2w);
2256     if ( w2m != ICONV_T_INVALID )
2257         iconv_close(w2m);
2258 }
2259
2260 size_t
2261 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2262                         const char *src, size_t srcLen) const
2263 {
2264     if ( srcLen == wxNO_LEN )
2265     {
2266         // find the string length: notice that must be done differently for
2267         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2268         // consecutive NULs
2269         const size_t nulLen = GetMBNulLen();
2270         switch ( nulLen )
2271         {
2272             default:
2273                 return wxCONV_FAILED;
2274
2275             case 1:
2276                 srcLen = strlen(src); // arguably more optimized than our version
2277                 break;
2278
2279             case 2:
2280             case 4:
2281                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2282                 // but they also have to start at character boundary and not
2283                 // span two adjacent characters
2284                 const char *p;
2285                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2286                     ;
2287                 srcLen = p - src;
2288                 break;
2289         }
2290
2291         // when we're determining the length of the string ourselves we count
2292         // the terminating NUL(s) as part of it and always NUL-terminate the
2293         // output
2294         srcLen += nulLen;
2295     }
2296
2297     // we express length in the number of (wide) characters but iconv always
2298     // counts buffer sizes it in bytes
2299     dstLen *= SIZEOF_WCHAR_T;
2300
2301 #if wxUSE_THREADS
2302     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2303     //     Unfortunately there are a couple of global wxCSConv objects such as
2304     //     wxConvLocal that are used all over wx code, so we have to make sure
2305     //     the handle is used by at most one thread at the time. Otherwise
2306     //     only a few wx classes would be safe to use from non-main threads
2307     //     as MB<->WC conversion would fail "randomly".
2308     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2309 #endif // wxUSE_THREADS
2310
2311     size_t res, cres;
2312     const char *pszPtr = src;
2313
2314     if ( dst )
2315     {
2316         char* bufPtr = (char*)dst;
2317
2318         // have destination buffer, convert there
2319         size_t dstLenOrig = dstLen;
2320         cres = iconv(m2w,
2321                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2322                      &bufPtr, &dstLen);
2323
2324         // convert the number of bytes converted as returned by iconv to the
2325         // number of (wide) characters converted that we need
2326         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2327
2328         if (ms_wcNeedsSwap)
2329         {
2330             // convert to native endianness
2331             for ( unsigned i = 0; i < res; i++ )
2332                 dst[i] = WC_BSWAP(dst[i]);
2333         }
2334     }
2335     else // no destination buffer
2336     {
2337         // convert using temp buffer to calculate the size of the buffer needed
2338         wchar_t tbuf[256];
2339         res = 0;
2340
2341         do
2342         {
2343             char* bufPtr = (char*)tbuf;
2344             dstLen = 8 * SIZEOF_WCHAR_T;
2345
2346             cres = iconv(m2w,
2347                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2348                          &bufPtr, &dstLen );
2349
2350             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2351         }
2352         while ((cres == (size_t)-1) && (errno == E2BIG));
2353     }
2354
2355     if (ICONV_FAILED(cres, srcLen))
2356     {
2357         //VS: it is ok if iconv fails, hence trace only
2358         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2359         return wxCONV_FAILED;
2360     }
2361
2362     return res;
2363 }
2364
2365 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2366                                  const wchar_t *src, size_t srcLen) const
2367 {
2368 #if wxUSE_THREADS
2369     // NB: explained in MB2WC
2370     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2371 #endif
2372
2373     if ( srcLen == wxNO_LEN )
2374         srcLen = wxWcslen(src) + 1;
2375
2376     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2377     size_t outbuflen = dstLen;
2378     size_t res, cres;
2379
2380     wchar_t *tmpbuf = 0;
2381
2382     if (ms_wcNeedsSwap)
2383     {
2384         // need to copy to temp buffer to switch endianness
2385         // (doing WC_BSWAP twice on the original buffer won't work, as it
2386         //  could be in read-only memory, or be accessed in some other thread)
2387         tmpbuf = (wchar_t *)malloc(inbuflen);
2388         for ( size_t i = 0; i < srcLen; i++ )
2389             tmpbuf[i] = WC_BSWAP(src[i]);
2390
2391         src = tmpbuf;
2392     }
2393
2394     char* inbuf = (char*)src;
2395     if ( dst )
2396     {
2397         // have destination buffer, convert there
2398         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2399
2400         res = dstLen - outbuflen;
2401     }
2402     else // no destination buffer
2403     {
2404         // convert using temp buffer to calculate the size of the buffer needed
2405         char tbuf[256];
2406         res = 0;
2407         do
2408         {
2409             dst = tbuf;
2410             outbuflen = WXSIZEOF(tbuf);
2411
2412             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2413
2414             res += WXSIZEOF(tbuf) - outbuflen;
2415         }
2416         while ((cres == (size_t)-1) && (errno == E2BIG));
2417     }
2418
2419     if (ms_wcNeedsSwap)
2420     {
2421         free(tmpbuf);
2422     }
2423
2424     if (ICONV_FAILED(cres, inbuflen))
2425     {
2426         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2427         return wxCONV_FAILED;
2428     }
2429
2430     return res;
2431 }
2432
2433 size_t wxMBConv_iconv::GetMBNulLen() const
2434 {
2435     if ( m_minMBCharWidth == 0 )
2436     {
2437         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2438
2439 #if wxUSE_THREADS
2440         // NB: explained in MB2WC
2441         wxMutexLocker lock(self->m_iconvMutex);
2442 #endif
2443
2444         const wchar_t *wnul = L"";
2445         char buf[8]; // should be enough for NUL in any encoding
2446         size_t inLen = sizeof(wchar_t),
2447                outLen = WXSIZEOF(buf);
2448         char *inBuff = (char *)wnul;
2449         char *outBuff = buf;
2450         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2451         {
2452             self->m_minMBCharWidth = (size_t)-1;
2453         }
2454         else // ok
2455         {
2456             self->m_minMBCharWidth = outBuff - buf;
2457         }
2458     }
2459
2460     return m_minMBCharWidth;
2461 }
2462
2463 #if wxUSE_UNICODE_UTF8
2464 bool wxMBConv_iconv::IsUTF8() const
2465 {
2466     return wxStricmp(m_name, "UTF-8") == 0 ||
2467            wxStricmp(m_name, "UTF8") == 0;
2468 }
2469 #endif
2470
2471 #endif // HAVE_ICONV
2472
2473
2474 // ============================================================================
2475 // Win32 conversion classes
2476 // ============================================================================
2477
2478 #ifdef wxHAVE_WIN32_MB2WC
2479
2480 // from utils.cpp
2481 #if wxUSE_FONTMAP
2482 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2483 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2484 #endif
2485
2486 class wxMBConv_win32 : public wxMBConv
2487 {
2488 public:
2489     wxMBConv_win32()
2490     {
2491         m_CodePage = CP_ACP;
2492         m_minMBCharWidth = 0;
2493     }
2494
2495     wxMBConv_win32(const wxMBConv_win32& conv)
2496         : wxMBConv()
2497     {
2498         m_CodePage = conv.m_CodePage;
2499         m_minMBCharWidth = conv.m_minMBCharWidth;
2500     }
2501
2502 #if wxUSE_FONTMAP
2503     wxMBConv_win32(const char* name)
2504     {
2505         m_CodePage = wxCharsetToCodepage(name);
2506         m_minMBCharWidth = 0;
2507     }
2508
2509     wxMBConv_win32(wxFontEncoding encoding)
2510     {
2511         m_CodePage = wxEncodingToCodepage(encoding);
2512         m_minMBCharWidth = 0;
2513     }
2514 #endif // wxUSE_FONTMAP
2515
2516     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2517     {
2518         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2519         // the behaviour is not compatible with the Unix version (using iconv)
2520         // and break the library itself, e.g. wxTextInputStream::NextChar()
2521         // wouldn't work if reading an incomplete MB char didn't result in an
2522         // error
2523         //
2524         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2525         // Win XP or newer and it is not supported for UTF-[78] so we always
2526         // use our own conversions in this case. See
2527         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2528         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2529         if ( m_CodePage == CP_UTF8 )
2530         {
2531             return wxMBConvUTF8().MB2WC(buf, psz, n);
2532         }
2533
2534         if ( m_CodePage == CP_UTF7 )
2535         {
2536             return wxMBConvUTF7().MB2WC(buf, psz, n);
2537         }
2538
2539         int flags = 0;
2540         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2541                 IsAtLeastWin2kSP4() )
2542         {
2543             flags = MB_ERR_INVALID_CHARS;
2544         }
2545
2546         const size_t len = ::MultiByteToWideChar
2547                              (
2548                                 m_CodePage,     // code page
2549                                 flags,          // flags: fall on error
2550                                 psz,            // input string
2551                                 -1,             // its length (NUL-terminated)
2552                                 buf,            // output string
2553                                 buf ? n : 0     // size of output buffer
2554                              );
2555         if ( !len )
2556         {
2557             // function totally failed
2558             return wxCONV_FAILED;
2559         }
2560
2561         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2562         // check if we succeeded, by doing a double trip:
2563         if ( !flags && buf )
2564         {
2565             const size_t mbLen = strlen(psz);
2566             wxCharBuffer mbBuf(mbLen);
2567             if ( ::WideCharToMultiByte
2568                    (
2569                       m_CodePage,
2570                       0,
2571                       buf,
2572                       -1,
2573                       mbBuf.data(),
2574                       mbLen + 1,        // size in bytes, not length
2575                       NULL,
2576                       NULL
2577                    ) == 0 ||
2578                   strcmp(mbBuf, psz) != 0 )
2579             {
2580                 // we didn't obtain the same thing we started from, hence
2581                 // the conversion was lossy and we consider that it failed
2582                 return wxCONV_FAILED;
2583             }
2584         }
2585
2586         // note that it returns count of written chars for buf != NULL and size
2587         // of the needed buffer for buf == NULL so in either case the length of
2588         // the string (which never includes the terminating NUL) is one less
2589         return len - 1;
2590     }
2591
2592     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2593     {
2594         /*
2595             we have a problem here: by default, WideCharToMultiByte() may
2596             replace characters unrepresentable in the target code page with bad
2597             quality approximations such as turning "1/2" symbol (U+00BD) into
2598             "1" for the code pages which don't have it and we, obviously, want
2599             to avoid this at any price
2600
2601             the trouble is that this function does it _silently_, i.e. it won't
2602             even tell us whether it did or not... Win98/2000 and higher provide
2603             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2604             we have to resort to a round trip, i.e. check that converting back
2605             results in the same string -- this is, of course, expensive but
2606             otherwise we simply can't be sure to not garble the data.
2607          */
2608
2609         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2610         // it doesn't work with CJK encodings (which we test for rather roughly
2611         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2612         // supporting it
2613         BOOL usedDef wxDUMMY_INITIALIZE(false);
2614         BOOL *pUsedDef;
2615         int flags;
2616         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2617         {
2618             // it's our lucky day
2619             flags = WC_NO_BEST_FIT_CHARS;
2620             pUsedDef = &usedDef;
2621         }
2622         else // old system or unsupported encoding
2623         {
2624             flags = 0;
2625             pUsedDef = NULL;
2626         }
2627
2628         const size_t len = ::WideCharToMultiByte
2629                              (
2630                                 m_CodePage,     // code page
2631                                 flags,          // either none or no best fit
2632                                 pwz,            // input string
2633                                 -1,             // it is (wide) NUL-terminated
2634                                 buf,            // output buffer
2635                                 buf ? n : 0,    // and its size
2636                                 NULL,           // default "replacement" char
2637                                 pUsedDef        // [out] was it used?
2638                              );
2639
2640         if ( !len )
2641         {
2642             // function totally failed
2643             return wxCONV_FAILED;
2644         }
2645
2646         // we did something, check if we really succeeded
2647         if ( flags )
2648         {
2649             // check if the conversion failed, i.e. if any replacements
2650             // were done
2651             if ( usedDef )
2652                 return wxCONV_FAILED;
2653         }
2654         else // we must resort to double tripping...
2655         {
2656             // first we need to ensure that we really have the MB data: this is
2657             // not the case if we're called with NULL buffer, in which case we
2658             // need to do the conversion yet again
2659             wxCharBuffer bufDef;
2660             if ( !buf )
2661             {
2662                 bufDef = wxCharBuffer(len);
2663                 buf = bufDef.data();
2664                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2665                                             buf, len, NULL, NULL) )
2666                     return wxCONV_FAILED;
2667             }
2668
2669             if ( !n )
2670                 n = wcslen(pwz);
2671             wxWCharBuffer wcBuf(n);
2672             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2673                     wcscmp(wcBuf, pwz) != 0 )
2674             {
2675                 // we didn't obtain the same thing we started from, hence
2676                 // the conversion was lossy and we consider that it failed
2677                 return wxCONV_FAILED;
2678             }
2679         }
2680
2681         // see the comment above for the reason of "len - 1"
2682         return len - 1;
2683     }
2684
2685     virtual size_t GetMBNulLen() const
2686     {
2687         if ( m_minMBCharWidth == 0 )
2688         {
2689             int len = ::WideCharToMultiByte
2690                         (
2691                             m_CodePage,     // code page
2692                             0,              // no flags
2693                             L"",            // input string
2694                             1,              // translate just the NUL
2695                             NULL,           // output buffer
2696                             0,              // and its size
2697                             NULL,           // no replacement char
2698                             NULL            // [out] don't care if it was used
2699                         );
2700
2701             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2702             switch ( len )
2703             {
2704                 default:
2705                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2706                     self->m_minMBCharWidth = (size_t)-1;
2707                     break;
2708
2709                 case 0:
2710                     self->m_minMBCharWidth = (size_t)-1;
2711                     break;
2712
2713                 case 1:
2714                 case 2:
2715                 case 4:
2716                     self->m_minMBCharWidth = len;
2717                     break;
2718             }
2719         }
2720
2721         return m_minMBCharWidth;
2722     }
2723
2724     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2725
2726     bool IsOk() const { return m_CodePage != -1; }
2727
2728 private:
2729     static bool CanUseNoBestFit()
2730     {
2731         static int s_isWin98Or2k = -1;
2732
2733         if ( s_isWin98Or2k == -1 )
2734         {
2735             int verMaj, verMin;
2736             switch ( wxGetOsVersion(&verMaj, &verMin) )
2737             {
2738                 case wxOS_WINDOWS_9X:
2739                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2740                     break;
2741
2742                 case wxOS_WINDOWS_NT:
2743                     s_isWin98Or2k = verMaj >= 5;
2744                     break;
2745
2746                 default:
2747                     // unknown: be conservative by default
2748                     s_isWin98Or2k = 0;
2749                     break;
2750             }
2751
2752             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2753         }
2754
2755         return s_isWin98Or2k == 1;
2756     }
2757
2758     static bool IsAtLeastWin2kSP4()
2759     {
2760 #ifdef __WXWINCE__
2761         return false;
2762 #else
2763         static int s_isAtLeastWin2kSP4 = -1;
2764
2765         if ( s_isAtLeastWin2kSP4 == -1 )
2766         {
2767             OSVERSIONINFOEX ver;
2768
2769             memset(&ver, 0, sizeof(ver));
2770             ver.dwOSVersionInfoSize = sizeof(ver);
2771             GetVersionEx((OSVERSIONINFO*)&ver);
2772
2773             s_isAtLeastWin2kSP4 =
2774               ((ver.dwMajorVersion > 5) || // Vista+
2775                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2776                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2777                ver.wServicePackMajor >= 4)) // 2000 SP4+
2778               ? 1 : 0;
2779         }
2780
2781         return s_isAtLeastWin2kSP4 == 1;
2782 #endif
2783     }
2784
2785
2786     // the code page we're working with
2787     long m_CodePage;
2788
2789     // cached result of GetMBNulLen(), set to 0 initially meaning
2790     // "unknown"
2791     size_t m_minMBCharWidth;
2792 };
2793
2794 #endif // wxHAVE_WIN32_MB2WC
2795
2796
2797 // ============================================================================
2798 // wxEncodingConverter based conversion classes
2799 // ============================================================================
2800
2801 #if wxUSE_FONTMAP
2802
2803 class wxMBConv_wxwin : public wxMBConv
2804 {
2805 private:
2806     void Init()
2807     {
2808         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2809         // The wxMBConv_cf class does a better job.
2810         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2811                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2812                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2813     }
2814
2815 public:
2816     // temporarily just use wxEncodingConverter stuff,
2817     // so that it works while a better implementation is built
2818     wxMBConv_wxwin(const char* name)
2819     {
2820         if (name)
2821             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2822         else
2823             m_enc = wxFONTENCODING_SYSTEM;
2824
2825         Init();
2826     }
2827
2828     wxMBConv_wxwin(wxFontEncoding enc)
2829     {
2830         m_enc = enc;
2831
2832         Init();
2833     }
2834
2835     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2836     {
2837         size_t inbuf = strlen(psz);
2838         if (buf)
2839         {
2840             if (!m2w.Convert(psz, buf))
2841                 return wxCONV_FAILED;
2842         }
2843         return inbuf;
2844     }
2845
2846     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2847     {
2848         const size_t inbuf = wxWcslen(psz);
2849         if (buf)
2850         {
2851             if (!w2m.Convert(psz, buf))
2852                 return wxCONV_FAILED;
2853         }
2854
2855         return inbuf;
2856     }
2857
2858     virtual size_t GetMBNulLen() const
2859     {
2860         switch ( m_enc )
2861         {
2862             case wxFONTENCODING_UTF16BE:
2863             case wxFONTENCODING_UTF16LE:
2864                 return 2;
2865
2866             case wxFONTENCODING_UTF32BE:
2867             case wxFONTENCODING_UTF32LE:
2868                 return 4;
2869
2870             default:
2871                 return 1;
2872         }
2873     }
2874
2875     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2876
2877     bool IsOk() const { return m_ok; }
2878
2879 public:
2880     wxFontEncoding m_enc;
2881     wxEncodingConverter m2w, w2m;
2882
2883 private:
2884     // were we initialized successfully?
2885     bool m_ok;
2886
2887     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2888 };
2889
2890 // make the constructors available for unit testing
2891 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2892 {
2893     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2894     if ( !result->IsOk() )
2895     {
2896         delete result;
2897         return 0;
2898     }
2899
2900     return result;
2901 }
2902
2903 #endif // wxUSE_FONTMAP
2904
2905 // ============================================================================
2906 // wxCSConv implementation
2907 // ============================================================================
2908
2909 void wxCSConv::Init()
2910 {
2911     m_name = NULL;
2912     m_convReal =  NULL;
2913     m_deferred = true;
2914 }
2915
2916 wxCSConv::wxCSConv(const wxString& charset)
2917 {
2918     Init();
2919
2920     if ( !charset.empty() )
2921     {
2922         SetName(charset.ToAscii());
2923     }
2924
2925 #if wxUSE_FONTMAP
2926     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2927     if ( m_encoding == wxFONTENCODING_MAX )
2928     {
2929         // set to unknown/invalid value
2930         m_encoding = wxFONTENCODING_SYSTEM;
2931     }
2932     else if ( m_encoding == wxFONTENCODING_DEFAULT )
2933     {
2934         // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2935         m_encoding = wxFONTENCODING_ISO8859_1;
2936     }
2937 #else
2938     m_encoding = wxFONTENCODING_SYSTEM;
2939 #endif
2940 }
2941
2942 wxCSConv::wxCSConv(wxFontEncoding encoding)
2943 {
2944     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2945     {
2946         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2947
2948         encoding = wxFONTENCODING_SYSTEM;
2949     }
2950
2951     Init();
2952
2953     m_encoding = encoding;
2954 }
2955
2956 wxCSConv::~wxCSConv()
2957 {
2958     Clear();
2959 }
2960
2961 wxCSConv::wxCSConv(const wxCSConv& conv)
2962         : wxMBConv()
2963 {
2964     Init();
2965
2966     SetName(conv.m_name);
2967     m_encoding = conv.m_encoding;
2968 }
2969
2970 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2971 {
2972     Clear();
2973
2974     SetName(conv.m_name);
2975     m_encoding = conv.m_encoding;
2976
2977     return *this;
2978 }
2979
2980 void wxCSConv::Clear()
2981 {
2982     free(m_name);
2983     delete m_convReal;
2984
2985     m_name = NULL;
2986     m_convReal = NULL;
2987 }
2988
2989 void wxCSConv::SetName(const char *charset)
2990 {
2991     if (charset)
2992     {
2993         m_name = wxStrdup(charset);
2994         m_deferred = true;
2995     }
2996 }
2997
2998 #if wxUSE_FONTMAP
2999
3000 WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
3001                      wxEncodingNameCache );
3002
3003 static wxEncodingNameCache gs_nameCache;
3004 #endif
3005
3006 wxMBConv *wxCSConv::DoCreate() const
3007 {
3008 #if wxUSE_FONTMAP
3009     wxLogTrace(TRACE_STRCONV,
3010                wxT("creating conversion for %s"),
3011                (m_name ? m_name
3012                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3013 #endif // wxUSE_FONTMAP
3014
3015     // check for the special case of ASCII or ISO8859-1 charset: as we have
3016     // special knowledge of it anyhow, we don't need to create a special
3017     // conversion object
3018     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3019             m_encoding == wxFONTENCODING_DEFAULT )
3020     {
3021         // don't convert at all
3022         return NULL;
3023     }
3024
3025     // we trust OS to do conversion better than we can so try external
3026     // conversion methods first
3027     //
3028     // the full order is:
3029     //      1. OS conversion (iconv() under Unix or Win32 API)
3030     //      2. hard coded conversions for UTF
3031     //      3. wxEncodingConverter as fall back
3032
3033     // step (1)
3034 #ifdef HAVE_ICONV
3035 #if !wxUSE_FONTMAP
3036     if ( m_name )
3037 #endif // !wxUSE_FONTMAP
3038     {
3039 #if wxUSE_FONTMAP
3040         wxFontEncoding encoding(m_encoding);
3041 #endif
3042
3043         if ( m_name )
3044         {
3045             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3046             if ( conv->IsOk() )
3047                 return conv;
3048
3049             delete conv;
3050
3051 #if wxUSE_FONTMAP
3052             encoding =
3053                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3054 #endif // wxUSE_FONTMAP
3055         }
3056 #if wxUSE_FONTMAP
3057         {
3058             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3059             if ( it != gs_nameCache.end() )
3060             {
3061                 if ( it->second.empty() )
3062                     return NULL;
3063
3064                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3065                 if ( conv->IsOk() )
3066                     return conv;
3067
3068                 delete conv;
3069             }
3070
3071             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3072             // CS : in case this does not return valid names (eg for MacRoman)
3073             // encoding got a 'failure' entry in the cache all the same,
3074             // although it just has to be created using a different method, so
3075             // only store failed iconv creation attempts (or perhaps we
3076             // shoulnd't do this at all ?)
3077             if ( names[0] != NULL )
3078             {
3079                 for ( ; *names; ++names )
3080                 {
3081                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3082                     //             will need changes that will obsolete this
3083                     wxString name(*names);
3084                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3085                     if ( conv->IsOk() )
3086                     {
3087                         gs_nameCache[encoding] = *names;
3088                         return conv;
3089                     }
3090
3091                     delete conv;
3092                 }
3093
3094                 gs_nameCache[encoding] = wxT(""); // cache the failure
3095             }
3096         }
3097 #endif // wxUSE_FONTMAP
3098     }
3099 #endif // HAVE_ICONV
3100
3101 #ifdef wxHAVE_WIN32_MB2WC
3102     {
3103 #if wxUSE_FONTMAP
3104         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3105                                       : new wxMBConv_win32(m_encoding);
3106         if ( conv->IsOk() )
3107             return conv;
3108
3109         delete conv;
3110 #else
3111         return NULL;
3112 #endif
3113     }
3114 #endif // wxHAVE_WIN32_MB2WC
3115
3116 #ifdef __DARWIN__
3117     {
3118         // leave UTF16 and UTF32 to the built-ins of wx
3119         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3120             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3121         {
3122 #if wxUSE_FONTMAP
3123             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3124                                           : new wxMBConv_cf(m_encoding);
3125 #else
3126             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3127 #endif
3128
3129             if ( conv->IsOk() )
3130                  return conv;
3131
3132             delete conv;
3133         }
3134     }
3135 #endif // __DARWIN__
3136
3137     // step (2)
3138     wxFontEncoding enc = m_encoding;
3139 #if wxUSE_FONTMAP
3140     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3141     {
3142         // use "false" to suppress interactive dialogs -- we can be called from
3143         // anywhere and popping up a dialog from here is the last thing we want to
3144         // do
3145         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3146     }
3147 #endif // wxUSE_FONTMAP
3148
3149     switch ( enc )
3150     {
3151         case wxFONTENCODING_UTF7:
3152              return new wxMBConvUTF7;
3153
3154         case wxFONTENCODING_UTF8:
3155              return new wxMBConvUTF8;
3156
3157         case wxFONTENCODING_UTF16BE:
3158              return new wxMBConvUTF16BE;
3159
3160         case wxFONTENCODING_UTF16LE:
3161              return new wxMBConvUTF16LE;
3162
3163         case wxFONTENCODING_UTF32BE:
3164              return new wxMBConvUTF32BE;
3165
3166         case wxFONTENCODING_UTF32LE:
3167              return new wxMBConvUTF32LE;
3168
3169         default:
3170              // nothing to do but put here to suppress gcc warnings
3171              break;
3172     }
3173
3174     // step (3)
3175 #if wxUSE_FONTMAP
3176     {
3177         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3178                                       : new wxMBConv_wxwin(m_encoding);
3179         if ( conv->IsOk() )
3180             return conv;
3181
3182         delete conv;
3183     }
3184
3185     wxLogTrace(TRACE_STRCONV,
3186                wxT("encoding \"%s\" is not supported by this system"),
3187                (m_name ? wxString(m_name)
3188                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3189 #endif // wxUSE_FONTMAP
3190
3191     return NULL;
3192 }
3193
3194 void wxCSConv::CreateConvIfNeeded() const
3195 {
3196     if ( m_deferred )
3197     {
3198         wxCSConv *self = (wxCSConv *)this; // const_cast
3199
3200         // if we don't have neither the name nor the encoding, use the default
3201         // encoding for this system
3202         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3203         {
3204 #if wxUSE_INTL
3205             self->m_encoding = wxLocale::GetSystemEncoding();
3206 #else
3207             // fallback to some reasonable default:
3208             self->m_encoding = wxFONTENCODING_ISO8859_1;
3209 #endif // wxUSE_INTL
3210         }
3211
3212         self->m_convReal = DoCreate();
3213         self->m_deferred = false;
3214     }
3215 }
3216
3217 bool wxCSConv::IsOk() const
3218 {
3219     CreateConvIfNeeded();
3220
3221     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3222     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3223         return true; // always ok as we do it ourselves
3224
3225     // m_convReal->IsOk() is called at its own creation, so we know it must
3226     // be ok if m_convReal is non-NULL
3227     return m_convReal != NULL;
3228 }
3229
3230 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3231                          const char *src, size_t srcLen) const
3232 {
3233     CreateConvIfNeeded();
3234
3235     if (m_convReal)
3236         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3237
3238     // latin-1 (direct)
3239     if ( srcLen == wxNO_LEN )
3240         srcLen = strlen(src) + 1; // take trailing NUL too
3241
3242     if ( dst )
3243     {
3244         if ( dstLen < srcLen )
3245             return wxCONV_FAILED;
3246
3247         for ( size_t n = 0; n < srcLen; n++ )
3248             dst[n] = (unsigned char)(src[n]);
3249     }
3250
3251     return srcLen;
3252 }
3253
3254 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3255                            const wchar_t *src, size_t srcLen) const
3256 {
3257     CreateConvIfNeeded();
3258
3259     if (m_convReal)
3260         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3261
3262     // latin-1 (direct)
3263     if ( srcLen == wxNO_LEN )
3264         srcLen = wxWcslen(src) + 1;
3265
3266     if ( dst )
3267     {
3268         if ( dstLen < srcLen )
3269             return wxCONV_FAILED;
3270
3271         for ( size_t n = 0; n < srcLen; n++ )
3272         {
3273             if ( src[n] > 0xFF )
3274                 return wxCONV_FAILED;
3275
3276             dst[n] = (char)src[n];
3277         }
3278
3279     }
3280     else // still need to check the input validity
3281     {
3282         for ( size_t n = 0; n < srcLen; n++ )
3283         {
3284             if ( src[n] > 0xFF )
3285                 return wxCONV_FAILED;
3286         }
3287     }
3288
3289     return srcLen;
3290 }
3291
3292 size_t wxCSConv::GetMBNulLen() const
3293 {
3294     CreateConvIfNeeded();
3295
3296     if ( m_convReal )
3297     {
3298         return m_convReal->GetMBNulLen();
3299     }
3300
3301     // otherwise, we are ISO-8859-1
3302     return 1;
3303 }
3304
3305 #if wxUSE_UNICODE_UTF8
3306 bool wxCSConv::IsUTF8() const
3307 {
3308     CreateConvIfNeeded();
3309
3310     if ( m_convReal )
3311     {
3312         return m_convReal->IsUTF8();
3313     }
3314
3315     // otherwise, we are ISO-8859-1
3316     return false;
3317 }
3318 #endif
3319
3320
3321 #if wxUSE_UNICODE
3322
3323 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3324 {
3325     if ( !s )
3326         return wxWCharBuffer();
3327
3328     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3329     if ( !wbuf )
3330         wbuf = wxMBConvUTF8().cMB2WX(s);
3331     if ( !wbuf )
3332         wbuf = wxConvISO8859_1.cMB2WX(s);
3333
3334     return wbuf;
3335 }
3336
3337 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3338 {
3339     if ( !ws )
3340         return wxCharBuffer();
3341
3342     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3343     if ( !buf )
3344         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3345
3346     return buf;
3347 }
3348
3349 #endif // wxUSE_UNICODE
3350
3351 // ----------------------------------------------------------------------------
3352 // globals
3353 // ----------------------------------------------------------------------------
3354
3355 // NB: The reason why we create converter objects in this convoluted way,
3356 //     using a factory function instead of a global variable, is that they
3357 //     may be used at static initialization time (some of them are used by
3358 //     wxString ctors and there may be a global wxString object). In other
3359 //     words, possibly _before_ the converter global object would be
3360 //     initialized.
3361
3362 #undef wxConvLibc
3363 #undef wxConvUTF8
3364 #undef wxConvUTF7
3365 #undef wxConvLocal
3366 #undef wxConvISO8859_1
3367
// Define everything needed for the global converter "name":
//
//  - the exported pointer name##Ptr of type klass*, initially NULL
//  - the exported accessor wxGet_<name>Ptr() returning the address of a
//    function-local static object of type impl_klass which is constructed
//    using ctor_args the first time the accessor is called
//  - the file-static pointer gs_<name>instance whose initializer calls the
//    accessor
//
// The macro expansion doesn't end with a semicolon, it must be provided at
// the point of use.
3368 #define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
3369     WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
3370     WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
3371     {                                                                   \
3372         static impl_klass name##Obj ctor_args;                          \
3373         return &name##Obj;                                              \
3374     }                                                                   \
3375     /* this ensures that all global converter objects are created */    \
3376     /* by the time static initialization is done, i.e. before any */    \
3377     /* thread is launched: */                                           \
3378     static klass* gs_##name##instance = wxGet_##name##Ptr()
3379
// Same as WX_DEFINE_GLOBAL_CONV2() for the case when the object is of the
// same class as the one used for the pointer and the accessor.
3380 #define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
3381     WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3382
3383 #ifdef __INTELC__
3384     // disable warning "variable 'xxx' was declared but never referenced"
3385     #pragma warning(disable: 177)
3386 #endif // Intel C++
3387
3388 #ifdef __WINDOWS__
3389     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3390 #elif 0 // defined(__WXOSX__)
3391     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
3392 #else
3393     WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
3394 #endif
3395
3396 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3397 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3398 //     provokes an error message about "not enough macro parameters"; and we
3399 //     can't use "()" here as the name##Obj declaration would be parsed as a
3400 //     function declaration then, so use a semicolon and live with an extra
3401 //     empty statement (and hope that no compiler warns about this)
3402 WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
3403 WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);
3404
3405 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
3406 WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));
3407
3408 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
3409 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3410
3411 #ifdef __DARWIN__
3412 // The xnu kernel always communicates file paths in decomposed UTF-8.
3413 // WARNING: Are we sure that CFString's conversion will cause decomposition?
3414 static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
3415 #endif
3416
3417 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
3418 #ifdef __DARWIN__
3419                                     &wxConvMacUTF8DObj;
3420 #else // !__DARWIN__
3421                                     wxGet_wxConvLibcPtr();
3422 #endif // __DARWIN__/!__DARWIN__
3423
3424 #else // !wxUSE_WCHAR_T
3425
3426 // FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
3427 // stand-ins in absence of wchar_t
3428 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
3429                                 wxConvISO8859_1,
3430                                 wxConvLocal,
3431                                 wxConvUTF8;
3432
3433 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T