src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV wxT("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// type of the elements of the buffer passed to wxDecodeSurrogate(): this is
// wchar_t itself if WC_UTF16 is defined and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of multibyte to wide character conversion, written
// in terms of MB2WC().
//
// dst and dstLen are the output buffer and its size in wide characters, dst
// may be NULL in which case only the size needed is computed. src and srcLen
// are the input and its length in bytes, srcLen may be wxNO_LEN if src is
// NUL-terminated.
//
// Returns the number of wide characters written to dst (or which would be
// written to it if it is NULL) or wxCONV_FAILED if the length of NUL in this
// encoding can't be determined, MB2WC() fails or dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid having to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called for the entire NUL-terminated string and when it's called with a
    // fixed number of characters: in the former case (srcEnd == NULL) we must
    // count all characters we convert, NUL or not; but in the latter we do
    // not count the trailing NUL -- but still count all the NULs inside the
    // string
    //
    // so for the (simple) former case we just always count the trailing NUL,
    // but for the latter we need to wait until we see if there is going to be
    // another loop iteration and only count it then
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        // NOTE(review): the loop is left here before anything is stored in
        // dst, so for an empty chunk the NUL counted just above doesn't seem
        // to be written to the output buffer -- confirm that no caller relies
        // on it being there
        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;

        // if we got here then this wasn't the last chunk in this string and
        // hence we must count an extra char for L'\0' even when converting a
        // fixed number of characters (srcEnd can't be NULL any more at this
        // point as we would have left the loop above otherwise)
        if ( srcEnd )
        {
            dstWritten++;
            if ( dst )
                dst++;
        }
    }

    return dstWritten;
}
 296
 297 size_t
 298 wxMBConv::FromWChar(char *dst, size_t dstLen,
 299                     const wchar_t *src, size_t srcLen) const
 300 {
 301     // the number of chars [which would be] written to dst [if it were not NULL]
 302     size_t dstWritten = 0;
 303
 304     // if we don't know its length we have no choice but to assume that it is
 305     // NUL-terminated (notice that it can still be NUL-terminated even if
 306     // explicit length is given but it doesn't change our return value)
 307     const bool isNulTerminated = srcLen == wxNO_LEN;
 308
 309     // make a copy of the input string unless it is already properly
 310     // NUL-terminated
 311     wxWCharBuffer bufTmp;
 312     if ( isNulTerminated )
 313     {
 314         srcLen = wxWcslen(src) + 1;
 315     }
 316     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 317     {
 318         // make a copy in order to properly NUL-terminate the string
 319         bufTmp = wxWCharBuffer(srcLen);
 320         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 321         src = bufTmp;
 322     }
 323
 324     const size_t lenNul = GetMBNulLen();
 325     for ( const wchar_t * const srcEnd = src + srcLen;
 326           src < srcEnd;
 327           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 328     {
 329         // try to convert the current chunk
 330         size_t lenChunk = WC2MB(NULL, src, 0);
 331
 332         if ( lenChunk == wxCONV_FAILED )
 333             return wxCONV_FAILED;
 334
 335         dstWritten += lenChunk;
 336         if ( src+lenChunk < srcEnd || isNulTerminated )
 337             dstWritten += lenNul;
 338
 339         if ( dst )
 340         {
 341             if ( dstWritten > dstLen )
 342                 return wxCONV_FAILED;
 343
 344             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 345                 return wxCONV_FAILED;
 346
 347             dst += lenChunk;
 348             if ( src+lenChunk < srcEnd || isNulTerminated )
 349                 dst += lenNul;
 350         }
 351     }
 352
 353     return dstWritten;
 354 }
 355
 356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 357 {
 358     size_t rc = ToWChar(outBuff, outLen, inBuff);
 359     if ( rc != wxCONV_FAILED )
 360     {
 361         // ToWChar() returns the buffer length, i.e. including the trailing
 362         // NUL, while this method doesn't take it into account
 363         rc--;
 364     }
 365
 366     return rc;
 367 }
 368
 369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 370 {
 371     size_t rc = FromWChar(outBuff, outLen, inBuff);
 372     if ( rc != wxCONV_FAILED )
 373     {
 374         rc -= GetMBNulLen();
 375     }
 376
 377     return rc;
 378 }
 379
// The destructor is intentionally empty but still defined out of line here.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 384
 385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 386 {
 387     if ( psz )
 388     {
 389         // calculate the length of the buffer needed first
 390         const size_t nLen = ToWChar(NULL, 0, psz);
 391         if ( nLen != wxCONV_FAILED )
 392         {
 393             // now do the actual conversion
 394             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 395
 396             // +1 for the trailing NULL
 397             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 398                 return buf;
 399         }
 400     }
 401
 402     return wxWCharBuffer();
 403 }
 404
 405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 406 {
 407     if ( pwz )
 408     {
 409         const size_t nLen = FromWChar(NULL, 0, pwz);
 410         if ( nLen != wxCONV_FAILED )
 411         {
 412             wxCharBuffer buf(nLen - 1);
 413             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 414                 return buf;
 415         }
 416     }
 417
 418     return wxCharBuffer();
 419 }
 420
 421 const wxWCharBuffer
 422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 423 {
 424     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 425     if ( dstLen != wxCONV_FAILED )
 426     {
 427         // notice that we allocate space for dstLen+1 wide characters here
 428         // because we want the buffer to always be NUL-terminated, even if the
 429         // input isn't (as otherwise the caller has no way to know its length)
 430         wxWCharBuffer wbuf(dstLen);
 431         wbuf.data()[dstLen] = L'\0';
 432         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 433         {
 434             if ( outLen )
 435             {
 436                 *outLen = dstLen;
 437
 438                 // we also need to handle NUL-terminated input strings
 439                 // specially: for them the output is the length of the string
 440                 // excluding the trailing NUL, however if we're asked to
 441                 // convert a specific number of characters we return the length
 442                 // of the resulting output even if it's NUL-terminated
 443                 if ( inLen == wxNO_LEN )
 444                     (*outLen)--;
 445             }
 446
 447             return wbuf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxWCharBuffer();
 455 }
 456
 457 const wxCharBuffer
 458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 459 {
 460     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 461     if ( dstLen != wxCONV_FAILED )
 462     {
 463         const size_t nulLen = GetMBNulLen();
 464
 465         // as above, ensure that the buffer is always NUL-terminated, even if
 466         // the input is not
 467         wxCharBuffer buf(dstLen + nulLen - 1);
 468         memset(buf.data() + dstLen, 0, nulLen);
 469         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 470         {
 471             if ( outLen )
 472             {
 473                 *outLen = dstLen;
 474
 475                 if ( inLen == wxNO_LEN )
 476                 {
 477                     // in this case both input and output are NUL-terminated
 478                     // and we're not supposed to count NUL
 479                     *outLen -= nulLen;
 480                 }
 481             }
 482
 483             return buf;
 484         }
 485     }
 486
 487     if ( outLen )
 488         *outLen = 0;
 489
 490     return wxCharBuffer();
 491 }
 492
 493 const wxWCharBuffer wxMBConv::cMB2WC(const wxScopedCharBuffer& buf) const
 494 {
 495     const size_t srcLen = buf.length();
 496     if ( srcLen )
 497     {
 498         const size_t dstLen = ToWChar(NULL, 0, buf, srcLen);
 499         if ( dstLen != wxCONV_FAILED )
 500         {
 501             wxWCharBuffer wbuf(dstLen);
 502             wbuf.data()[dstLen] = L'\0';
 503             if ( ToWChar(wbuf.data(), dstLen, buf, srcLen) != wxCONV_FAILED )
 504                 return wbuf;
 505         }
 506     }
 507
 508     return wxWCharBuffer();
 509 }
 510
 511 const wxCharBuffer wxMBConv::cWC2MB(const wxScopedWCharBuffer& wbuf) const
 512 {
 513     const size_t srcLen = wbuf.length();
 514     if ( srcLen )
 515     {
 516         const size_t dstLen = FromWChar(NULL, 0, wbuf, srcLen);
 517         if ( dstLen != wxCONV_FAILED )
 518         {
 519             wxCharBuffer buf(dstLen);
 520             buf.data()[dstLen] = '\0';
 521             if ( FromWChar(buf.data(), dstLen, wbuf, srcLen) != wxCONV_FAILED )
 522                 return buf;
 523         }
 524     }
 525
 526     return wxCharBuffer();
 527 }
 528
 529 // ----------------------------------------------------------------------------
 530 // wxMBConvLibc
 531 // ----------------------------------------------------------------------------
 532
// Converts the multibyte string psz to wide characters by simply forwarding
// all the arguments to wxMB2WC() and returning its result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 537
// Converts the wide string psz to multibyte by simply forwarding all the
// arguments to wxWC2MB() and returning its result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 542
 543 // ----------------------------------------------------------------------------
 544 // wxConvBrokenFileNames
 545 // ----------------------------------------------------------------------------
 546
 547 #ifdef __UNIX__
 548
 549 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 550 {
 551     if ( wxStricmp(charset, wxT("UTF-8")) == 0 ||
 552          wxStricmp(charset, wxT("UTF8")) == 0  )
 553         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 554     else
 555         m_conv = new wxCSConv(charset);
 556 }
 557
 558 #endif // __UNIX__
 559
 560 // ----------------------------------------------------------------------------
 561 // UTF-7
 562 // ----------------------------------------------------------------------------
 563
 564 // Implementation (C) 2004 Fredrik Roubert
 565 //
 566 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 567
//
// BASE64 decoding table
//
// It is indexed by the byte value and contains the 6 bit value of the
// corresponding base64 digit ('A'..'Z', 'a'..'z', '0'..'9', '+' and '/') or
// 0xff for all the bytes which are not base64 digits.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 606
 607 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 608                              const char *src, size_t srcLen) const
 609 {
 610     DecoderState stateOrig,
 611                 *statePtr;
 612     if ( srcLen == wxNO_LEN )
 613     {
 614         // convert the entire string, up to and including the trailing NUL
 615         srcLen = strlen(src) + 1;
 616
 617         // when working on the entire strings we don't update nor use the shift
 618         // state from the previous call
 619         statePtr = &stateOrig;
 620     }
 621     else // when working with partial strings we do use the shift state
 622     {
 623         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 624
 625         // also save the old state to be able to rollback to it on error
 626         stateOrig = m_stateDecoder;
 627     }
 628
 629     // but to simplify the code below we use this variable in both cases
 630     DecoderState& state = *statePtr;
 631
 632
 633     // number of characters [which would have been] written to dst [if it were
 634     // not NULL]
 635     size_t len = 0;
 636
 637     const char * const srcEnd = src + srcLen;
 638
 639     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 640     {
 641         const unsigned char cc = *src++;
 642
 643         if ( state.IsShifted() )
 644         {
 645             const unsigned char dc = utf7unb64[cc];
 646             if ( dc == 0xff )
 647             {
 648                 // end of encoded part, check that nothing was left: there can
 649                 // be up to 4 bits of 0 padding but nothing else (we also need
 650                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 651                 // encoded sequence must contain an integral number of UTF-16
 652                 // characters)
 653                 if ( state.isLSB || state.bit > 4 ||
 654                         (state.accum & ((1 << state.bit) - 1)) )
 655                 {
 656                     if ( !len )
 657                         state = stateOrig;
 658
 659                     return wxCONV_FAILED;
 660                 }
 661
 662                 state.ToDirect();
 663
 664                 // re-parse this character normally below unless it's '-' which
 665                 // is consumed by the decoder
 666                 if ( cc == '-' )
 667                     continue;
 668             }
 669             else // valid encoded character
 670             {
 671                 // mini base64 decoder: each character is 6 bits
 672                 state.bit += 6;
 673                 state.accum <<= 6;
 674                 state.accum += dc;
 675
 676                 if ( state.bit >= 8 )
 677                 {
 678                     // got the full byte, consume it
 679                     state.bit -= 8;
 680                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 681
 682                     if ( state.isLSB )
 683                     {
 684                         // we've got the full word, output it
 685                         if ( dst )
 686                             *dst++ = (state.msb << 8) | b;
 687                         len++;
 688                         state.isLSB = false;
 689                     }
 690                     else // MSB
 691                     {
 692                         // just store it while we wait for LSB
 693                         state.msb = b;
 694                         state.isLSB = true;
 695                     }
 696                 }
 697             }
 698         }
 699
 700         if ( state.IsDirect() )
 701         {
 702             // start of an encoded segment?
 703             if ( cc == '+' )
 704             {
 705                 if ( *src == '-' )
 706                 {
 707                     // just the encoded plus sign, don't switch to shifted mode
 708                     if ( dst )
 709                         *dst++ = '+';
 710                     len++;
 711                     src++;
 712                 }
 713                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 714                 {
 715                     // empty encoded chunks are not allowed
 716                     if ( !len )
 717                         state = stateOrig;
 718
 719                     return wxCONV_FAILED;
 720                 }
 721                 else // base-64 encoded chunk follows
 722                 {
 723                     state.ToShifted();
 724                 }
 725             }
 726             else // not '+'
 727             {
 728                 // only printable 7 bit ASCII characters (with the exception of
 729                 // NUL, TAB, CR and LF) can be used directly
 730                 if ( cc >= 0x7f || (cc < ' ' &&
 731                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 732                     return wxCONV_FAILED;
 733
 734                 if ( dst )
 735                     *dst++ = cc;
 736                 len++;
 737             }
 738         }
 739     }
 740
 741     if ( !len )
 742     {
 743         // as we didn't read any characters we should be called with the same
 744         // data (followed by some more new data) again later so don't save our
 745         // state
 746         state = stateOrig;
 747
 748         return wxCONV_FAILED;
 749     }
 750
 751     return len;
 752 }
 753
//
// BASE64 encoding table
//
// It is indexed by a 6 bit value and contains the base64 digit representing
// this value.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 768
 769 //
 770 // UTF-7 encoding table
 771 //
 772 // 0 - Set D (directly encoded characters)
 773 // 1 - Set O (optional direct characters)
 774 // 2 - whitespace characters (optional)
 775 // 3 - special characters
 776 //
 777 static const unsigned char utf7encode[128] =
 778 {
 779     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 780     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 781     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 782     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 783     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 784     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 785     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 786     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 787 };
 788
 789 static inline bool wxIsUTF7Direct(wchar_t wc)
 790 {
 791     return wc < 0x80 && utf7encode[wc] < 1;
 792 }
 793
// Encodes the wide string src in UTF-7.
//
// If srcLen is wxNO_LEN, src is a complete NUL-terminated string which is
// converted entirely, including its trailing NUL, starting from a fresh
// encoder state which is discarded afterwards. Otherwise exactly srcLen
// characters are encoded using the state stored in m_stateEncoder, which is
// updated unless dst is NULL.
//
// dst and dstLen are the output buffer and its size in bytes, dst may be NULL
// to just compute the size needed.
//
// Returns the number of bytes written to dst (or which would be written to it
// if it is NULL) or, if wchar_t is not UTF-16, wxCONV_FAILED if a character
// greater than 0xffff is encountered.
//
// NOTE(review): dstLen is only checked before starting to process each input
// character (or run of characters which need to be encoded) while several
// bytes can be output for it, so it looks like dst could be overflown if it
// is smaller than the size computed by calling this function with NULL dst
// -- to be confirmed.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = const_cast<EncoderState *>(&m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst [if it were not
    // NULL]
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            // a direct character terminates the encoded part, if any
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // the plus sign itself is represented by the "+-" sequence
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // start the encoded part if we're not inside one already
            if ( state.IsDirect() )
            {
                state.ToShifted();

                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // each character is encoded as 2 bytes, the most significant
                // one first
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // output a base64 digit for each 6 bits accumulated
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue with the next character unless there are no more
                // of them or it is a direct one: it is not consumed here in
                // the latter case but left for the next iteration of the
                // outer loop which also terminates the encoded part
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 906
 907 // ----------------------------------------------------------------------------
 908 // UTF-8
 909 // ----------------------------------------------------------------------------
 910
// NOTE(review): judging by the values, the element at index N of this table
// is presumably the greatest value which can be encoded using N+1 bytes in
// the original (up to 6 bytes long) UTF-8 scheme, with 0xffffffff as the last
// element -- confirm against the code using it
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 913
// boundaries of the private use area we use to (temporarily) remap the
// characters invalid in a UTF-8 encoded string: it starts at wxUnicodePUA and
// wxUnicodePUAEnd is 256 values after it
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 918
 919 // this table gives the length of the UTF-8 encoding from its first character:
 920 const unsigned char tableUtf8Lengths[256] = {
 921     // single-byte sequences (ASCII):
 922     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 923     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 924     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 925     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 926     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 927     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 928     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 929     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 930
 931     // these are invalid:
 932     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 933     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 934     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 935     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 936     0, 0,                                            // C0,C1
 937
 938     // two-byte sequences:
 939           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 940     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 941
 942     // three-byte sequences:
 943     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 944
 945     // four-byte sequences:
 946     4, 4, 4, 4, 4,                                   // F0..F4
 947
 948     // these are invalid again (5- or 6-byte
 949     // sequences and sequences for code points
 950     // above U+10FFFF, as restricted by RFC 3629):
 951                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 952 };
 953
 954 size_t
 955 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 956                             const char *src, size_t srcLen) const
 957 {
 958     wchar_t *out = dstLen ? dst : NULL;
 959     size_t written = 0;
 960
 961     if ( srcLen == wxNO_LEN )
 962         srcLen = strlen(src) + 1;
 963
 964     for ( const char *p = src; ; p++ )
 965     {
 966         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 967         {
 968             // all done successfully, just add the trailing NULL if we are not
 969             // using explicit length
 970             if ( srcLen == wxNO_LEN )
 971             {
 972                 if ( out )
 973                 {
 974                     if ( !dstLen )
 975                         break;
 976
 977                     *out = L'\0';
 978                 }
 979
 980                 written++;
 981             }
 982
 983             return written;
 984         }
 985
 986         if ( out && !dstLen-- )
 987             break;
 988
 989         wxUint32 code;
 990         unsigned char c = *p;
 991
 992         if ( c < 0x80 )
 993         {
 994             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 995                 break;
 996
 997             if ( srcLen != wxNO_LEN )
 998                 srcLen--;
 999
1000             code = c;
1001         }
1002         else
1003         {
1004             unsigned len = tableUtf8Lengths[c];
1005             if ( !len )
1006                 break;
1007
1008             if ( srcLen < len ) // the test works for wxNO_LEN too
1009                 break;
1010
1011             if ( srcLen != wxNO_LEN )
1012                 srcLen -= len;
1013
1014             //   Char. number range   |        UTF-8 octet sequence
1015             //      (hexadecimal)     |              (binary)
1016             //  ----------------------+----------------------------------------
1017             //  0000 0000 - 0000 007F | 0xxxxxxx
1018             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
1019             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1020             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1021             //
1022             //  Code point value is stored in bits marked with 'x',
1023             //  lowest-order bit of the value on the right side in the diagram
1024             //  above.                                         (from RFC 3629)
1025
1026             // mask to extract lead byte's value ('x' bits above), by sequence
1027             // length:
1028             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
1029
1030             // mask and value of lead byte's most significant bits, by length:
1031             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
1032             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
1033
1034             len--; // it's more convenient to work with 0-based length here
1035
1036             // extract the lead byte's value bits:
1037             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1038                 break;
1039
1040             code = c & leadValueMask[len];
1041
1042             // all remaining bytes, if any, are handled in the same way
1043             // regardless of sequence's length:
1044             for ( ; len; --len )
1045             {
1046                 c = *++p;
1047                 if ( (c & 0xC0) != 0x80 )
1048                     return wxCONV_FAILED;
1049
1050                 code <<= 6;
1051                 code |= c & 0x3F;
1052             }
1053         }
1054
1055 #ifdef WC_UTF16
1056         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1057         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1058         {
1059             if ( out )
1060                 out++;
1061             written++;
1062         }
1063 #else // !WC_UTF16
1064         if ( out )
1065             *out = code;
1066 #endif // WC_UTF16/!WC_UTF16
1067
1068         if ( out )
1069             out++;
1070
1071         written++;
1072     }
1073
1074     return wxCONV_FAILED;
1075 }
1076
1077 size_t
1078 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1079                               const wchar_t *src, size_t srcLen) const
1080 {
1081     char *out = dstLen ? dst : NULL;
1082     size_t written = 0;
1083
1084     for ( const wchar_t *wp = src; ; wp++ )
1085     {
1086         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1087         {
1088             // all done successfully, just add the trailing NULL if we are not
1089             // using explicit length
1090             if ( srcLen == wxNO_LEN )
1091             {
1092                 if ( out )
1093                 {
1094                     if ( !dstLen )
1095                         break;
1096
1097                     *out = '\0';
1098                 }
1099
1100                 written++;
1101             }
1102
1103             return written;
1104         }
1105
1106         if ( srcLen != wxNO_LEN )
1107             srcLen--;
1108
1109         wxUint32 code;
1110 #ifdef WC_UTF16
1111         // cast is ok for WC_UTF16
1112         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1113         {
1114             // skip the next char too as we decoded a surrogate
1115             wp++;
1116         }
1117 #else // wchar_t is UTF-32
1118         code = *wp & 0x7fffffff;
1119 #endif
1120
1121         unsigned len;
1122         if ( code <= 0x7F )
1123         {
1124             len = 1;
1125             if ( out )
1126             {
1127                 if ( dstLen < len )
1128                     break;
1129
1130                 out[0] = (char)code;
1131             }
1132         }
1133         else if ( code <= 0x07FF )
1134         {
1135             len = 2;
1136             if ( out )
1137             {
1138                 if ( dstLen < len )
1139                     break;
1140
1141                 // NB: this line takes 6 least significant bits, encodes them as
1142                 // 10xxxxxx and discards them so that the next byte can be encoded:
1143                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1144                 out[0] = 0xC0 | code;
1145             }
1146         }
1147         else if ( code < 0xFFFF )
1148         {
1149             len = 3;
1150             if ( out )
1151             {
1152                 if ( dstLen < len )
1153                     break;
1154
1155                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1156                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1157                 out[0] = 0xE0 | code;
1158             }
1159         }
1160         else if ( code <= 0x10FFFF )
1161         {
1162             len = 4;
1163             if ( out )
1164             {
1165                 if ( dstLen < len )
1166                     break;
1167
1168                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1169                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1170                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1171                 out[0] = 0xF0 | code;
1172             }
1173         }
1174         else
1175         {
1176             wxFAIL_MSG( wxT("trying to encode undefined Unicode character") );
1177             break;
1178         }
1179
1180         if ( out )
1181         {
1182             out += len;
1183             dstLen -= len;
1184         }
1185
1186         written += len;
1187     }
1188
1189     // we only get here if an error occurs during decoding
1190     return wxCONV_FAILED;
1191 }
1192
1193 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1194                              const char *psz, size_t srcLen) const
1195 {
1196     if ( m_options == MAP_INVALID_UTF8_NOT )
1197         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1198
1199     size_t len = 0;
1200
1201     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1202     {
1203         const char *opsz = psz;
1204         bool invalid = false;
1205         unsigned char cc = *psz++, fc = cc;
1206         unsigned cnt;
1207         for (cnt = 0; fc & 0x80; cnt++)
1208             fc <<= 1;
1209
1210         if (!cnt)
1211         {
1212             // plain ASCII char
1213             if (buf)
1214                 *buf++ = cc;
1215             len++;
1216
1217             // escape the escape character for octal escapes
1218             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1219                     && cc == '\\' && (!buf || len < n))
1220             {
1221                 if (buf)
1222                     *buf++ = cc;
1223                 len++;
1224             }
1225         }
1226         else
1227         {
1228             cnt--;
1229             if (!cnt)
1230             {
1231                 // invalid UTF-8 sequence
1232                 invalid = true;
1233             }
1234             else
1235             {
1236                 unsigned ocnt = cnt - 1;
1237                 wxUint32 res = cc & (0x3f >> cnt);
1238                 while (cnt--)
1239                 {
1240                     cc = *psz;
1241                     if ((cc & 0xC0) != 0x80)
1242                     {
1243                         // invalid UTF-8 sequence
1244                         invalid = true;
1245                         break;
1246                     }
1247
1248                     psz++;
1249                     res = (res << 6) | (cc & 0x3f);
1250                 }
1251
1252                 if (invalid || res <= utf8_max[ocnt])
1253                 {
1254                     // illegal UTF-8 encoding
1255                     invalid = true;
1256                 }
1257                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1258                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1259                 {
1260                     // if one of our PUA characters turns up externally
1261                     // it must also be treated as an illegal sequence
1262                     // (a bit like you have to escape an escape character)
1263                     invalid = true;
1264                 }
1265                 else
1266                 {
1267 #ifdef WC_UTF16
1268                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1269                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1270                     if (pa == wxCONV_FAILED)
1271                     {
1272                         invalid = true;
1273                     }
1274                     else
1275                     {
1276                         if (buf)
1277                             buf += pa;
1278                         len += pa;
1279                     }
1280 #else // !WC_UTF16
1281                     if (buf)
1282                         *buf++ = (wchar_t)res;
1283                     len++;
1284 #endif // WC_UTF16/!WC_UTF16
1285                 }
1286             }
1287
1288             if (invalid)
1289             {
1290                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1291                 {
1292                     while (opsz < psz && (!buf || len < n))
1293                     {
1294 #ifdef WC_UTF16
1295                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1296                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1297                         wxASSERT(pa != wxCONV_FAILED);
1298                         if (buf)
1299                             buf += pa;
1300                         opsz++;
1301                         len += pa;
1302 #else
1303                         if (buf)
1304                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1305                         opsz++;
1306                         len++;
1307 #endif
1308                     }
1309                 }
1310                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1311                 {
1312                     while (opsz < psz && (!buf || len < n))
1313                     {
1314                         if ( buf && len + 3 < n )
1315                         {
1316                             unsigned char on = *opsz;
1317                             *buf++ = L'\\';
1318                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1319                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1320                             *buf++ = (wchar_t)( L'0' + on % 010 );
1321                         }
1322
1323                         opsz++;
1324                         len += 4;
1325                     }
1326                 }
1327                 else // MAP_INVALID_UTF8_NOT
1328                 {
1329                     return wxCONV_FAILED;
1330                 }
1331             }
1332         }
1333     }
1334
1335     if (srcLen == wxNO_LEN && buf && (len < n))
1336         *buf = 0;
1337
1338     return len + 1;
1339 }
1340
// Return true if the given character is one of the octal digits '0'..'7'.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1345
1346 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1347                                const wchar_t *psz, size_t srcLen) const
1348 {
1349     if ( m_options == MAP_INVALID_UTF8_NOT )
1350         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1351
1352     size_t len = 0;
1353
1354     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1355     {
1356         wxUint32 cc;
1357
1358 #ifdef WC_UTF16
1359         // cast is ok for WC_UTF16
1360         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1361         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1362 #else
1363         cc = (*psz++) & 0x7fffffff;
1364 #endif
1365
1366         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1367                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1368         {
1369             if (buf)
1370                 *buf++ = (char)(cc - wxUnicodePUA);
1371             len++;
1372         }
1373         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1374                     && cc == L'\\' && psz[0] == L'\\' )
1375         {
1376             if (buf)
1377                 *buf++ = (char)cc;
1378             psz++;
1379             len++;
1380         }
1381         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1382                     cc == L'\\' &&
1383                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1384         {
1385             if (buf)
1386             {
1387                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1388                                  (psz[1] - L'0') * 010 +
1389                                  (psz[2] - L'0'));
1390             }
1391
1392             psz += 3;
1393             len++;
1394         }
1395         else
1396         {
1397             unsigned cnt;
1398             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1399             {
1400             }
1401
1402             if (!cnt)
1403             {
1404                 // plain ASCII char
1405                 if (buf)
1406                     *buf++ = (char) cc;
1407                 len++;
1408             }
1409             else
1410             {
1411                 len += cnt + 1;
1412                 if (buf)
1413                 {
1414                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1415                     while (cnt--)
1416                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1417                 }
1418             }
1419         }
1420     }
1421
1422     if (srcLen == wxNO_LEN && buf && (len < n))
1423         *buf = 0;
1424
1425     return len + 1;
1426 }
1427
1428 // ============================================================================
1429 // UTF-16
1430 // ============================================================================
1431
// "straight" is the converter for the UTF-16 variant using the native byte
// order of this machine and "swap" is the one for the opposite byte order
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1439
1440 /* static */
1441 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1442 {
1443     if ( srcLen == wxNO_LEN )
1444     {
1445         // count the number of bytes in input, including the trailing NULs
1446         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1447         for ( srcLen = 1; *inBuff++; srcLen++ )
1448             ;
1449
1450         srcLen *= BYTES_PER_CHAR;
1451     }
1452     else // we already have the length
1453     {
1454         // we can only convert an entire number of UTF-16 characters
1455         if ( srcLen % BYTES_PER_CHAR )
1456             return wxCONV_FAILED;
1457     }
1458
1459     return srcLen;
1460 }
1461
1462 // case when in-memory representation is UTF-16 too
1463 #ifdef WC_UTF16
1464
1465 // ----------------------------------------------------------------------------
1466 // conversions without endianness change
1467 // ----------------------------------------------------------------------------
1468
1469 size_t
1470 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1471                                const char *src, size_t srcLen) const
1472 {
1473     // set up the scene for using memcpy() (which is presumably more efficient
1474     // than copying the bytes one by one)
1475     srcLen = GetLength(src, srcLen);
1476     if ( srcLen == wxNO_LEN )
1477         return wxCONV_FAILED;
1478
1479     const size_t inLen = srcLen / BYTES_PER_CHAR;
1480     if ( dst )
1481     {
1482         if ( dstLen < inLen )
1483             return wxCONV_FAILED;
1484
1485         memcpy(dst, src, srcLen);
1486     }
1487
1488     return inLen;
1489 }
1490
1491 size_t
1492 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1493                                  const wchar_t *src, size_t srcLen) const
1494 {
1495     if ( srcLen == wxNO_LEN )
1496         srcLen = wxWcslen(src) + 1;
1497
1498     srcLen *= BYTES_PER_CHAR;
1499
1500     if ( dst )
1501     {
1502         if ( dstLen < srcLen )
1503             return wxCONV_FAILED;
1504
1505         memcpy(dst, src, srcLen);
1506     }
1507
1508     return srcLen;
1509 }
1510
1511 // ----------------------------------------------------------------------------
1512 // endian-reversing conversions
1513 // ----------------------------------------------------------------------------
1514
1515 size_t
1516 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1517                            const char *src, size_t srcLen) const
1518 {
1519     srcLen = GetLength(src, srcLen);
1520     if ( srcLen == wxNO_LEN )
1521         return wxCONV_FAILED;
1522
1523     srcLen /= BYTES_PER_CHAR;
1524
1525     if ( dst )
1526     {
1527         if ( dstLen < srcLen )
1528             return wxCONV_FAILED;
1529
1530         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1531         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1532         {
1533             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1534         }
1535     }
1536
1537     return srcLen;
1538 }
1539
1540 size_t
1541 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1542                              const wchar_t *src, size_t srcLen) const
1543 {
1544     if ( srcLen == wxNO_LEN )
1545         srcLen = wxWcslen(src) + 1;
1546
1547     srcLen *= BYTES_PER_CHAR;
1548
1549     if ( dst )
1550     {
1551         if ( dstLen < srcLen )
1552             return wxCONV_FAILED;
1553
1554         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1555         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1556         {
1557             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1558         }
1559     }
1560
1561     return srcLen;
1562 }
1563
1564 #else // !WC_UTF16: wchar_t is UTF-32
1565
1566 // ----------------------------------------------------------------------------
1567 // conversions without endianness change
1568 // ----------------------------------------------------------------------------
1569
1570 size_t
1571 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1572                                const char *src, size_t srcLen) const
1573 {
1574     srcLen = GetLength(src, srcLen);
1575     if ( srcLen == wxNO_LEN )
1576         return wxCONV_FAILED;
1577
1578     const size_t inLen = srcLen / BYTES_PER_CHAR;
1579     if ( !dst )
1580     {
1581         // optimization: return maximal space which could be needed for this
1582         // string even if the real size could be smaller if the buffer contains
1583         // any surrogates
1584         return inLen;
1585     }
1586
1587     size_t outLen = 0;
1588     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1589     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1590     {
1591         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1592         if ( !inBuff )
1593             return wxCONV_FAILED;
1594
1595         if ( ++outLen > dstLen )
1596             return wxCONV_FAILED;
1597
1598         *dst++ = ch;
1599     }
1600
1601
1602     return outLen;
1603 }
1604
1605 size_t
1606 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1607                                  const wchar_t *src, size_t srcLen) const
1608 {
1609     if ( srcLen == wxNO_LEN )
1610         srcLen = wxWcslen(src) + 1;
1611
1612     size_t outLen = 0;
1613     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1614     for ( size_t n = 0; n < srcLen; n++ )
1615     {
1616         wxUint16 cc[2];
1617         const size_t numChars = encode_utf16(*src++, cc);
1618         if ( numChars == wxCONV_FAILED )
1619             return wxCONV_FAILED;
1620
1621         outLen += numChars * BYTES_PER_CHAR;
1622         if ( outBuff )
1623         {
1624             if ( outLen > dstLen )
1625                 return wxCONV_FAILED;
1626
1627             *outBuff++ = cc[0];
1628             if ( numChars == 2 )
1629             {
1630                 // second character of a surrogate
1631                 *outBuff++ = cc[1];
1632             }
1633         }
1634     }
1635
1636     return outLen;
1637 }
1638
1639 // ----------------------------------------------------------------------------
1640 // endian-reversing conversions
1641 // ----------------------------------------------------------------------------
1642
1643 size_t
1644 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1645                            const char *src, size_t srcLen) const
1646 {
1647     srcLen = GetLength(src, srcLen);
1648     if ( srcLen == wxNO_LEN )
1649         return wxCONV_FAILED;
1650
1651     const size_t inLen = srcLen / BYTES_PER_CHAR;
1652     if ( !dst )
1653     {
1654         // optimization: return maximal space which could be needed for this
1655         // string even if the real size could be smaller if the buffer contains
1656         // any surrogates
1657         return inLen;
1658     }
1659
1660     size_t outLen = 0;
1661     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1662     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1663     {
1664         wxUint32 ch;
1665         wxUint16 tmp[2];
1666
1667         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1668         inBuff++;
1669         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1670
1671         const size_t numChars = decode_utf16(tmp, ch);
1672         if ( numChars == wxCONV_FAILED )
1673             return wxCONV_FAILED;
1674
1675         if ( numChars == 2 )
1676             inBuff++;
1677
1678         if ( ++outLen > dstLen )
1679             return wxCONV_FAILED;
1680
1681         *dst++ = ch;
1682     }
1683
1684
1685     return outLen;
1686 }
1687
1688 size_t
1689 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1690                              const wchar_t *src, size_t srcLen) const
1691 {
1692     if ( srcLen == wxNO_LEN )
1693         srcLen = wxWcslen(src) + 1;
1694
1695     size_t outLen = 0;
1696     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1697     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1698     {
1699         wxUint16 cc[2];
1700         const size_t numChars = encode_utf16(*src, cc);
1701         if ( numChars == wxCONV_FAILED )
1702             return wxCONV_FAILED;
1703
1704         outLen += numChars * BYTES_PER_CHAR;
1705         if ( outBuff )
1706         {
1707             if ( outLen > dstLen )
1708                 return wxCONV_FAILED;
1709
1710             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1711             if ( numChars == 2 )
1712             {
1713                 // second character of a surrogate
1714                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1715             }
1716         }
1717     }
1718
1719     return outLen;
1720 }
1721
1722 #endif // WC_UTF16/!WC_UTF16
1723
1724
1725 // ============================================================================
1726 // UTF-32
1727 // ============================================================================
1728
1729 #ifdef WORDS_BIGENDIAN
1730     #define wxMBConvUTF32straight  wxMBConvUTF32BE
1731     #define wxMBConvUTF32swap      wxMBConvUTF32LE
1732 #else
1733     #define wxMBConvUTF32swap      wxMBConvUTF32BE
1734     #define wxMBConvUTF32straight  wxMBConvUTF32LE
1735 #endif
1736
1737
1738 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
1739 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1740
1741 /* static */
1742 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1743 {
1744     if ( srcLen == wxNO_LEN )
1745     {
1746         // count the number of bytes in input, including the trailing NULs
1747         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1748         for ( srcLen = 1; *inBuff++; srcLen++ )
1749             ;
1750
1751         srcLen *= BYTES_PER_CHAR;
1752     }
1753     else // we already have the length
1754     {
1755         // we can only convert an entire number of UTF-32 characters
1756         if ( srcLen % BYTES_PER_CHAR )
1757             return wxCONV_FAILED;
1758     }
1759
1760     return srcLen;
1761 }
1762
1763 // case when in-memory representation is UTF-16
1764 #ifdef WC_UTF16
1765
1766 // ----------------------------------------------------------------------------
1767 // conversions without endianness change
1768 // ----------------------------------------------------------------------------
1769
1770 size_t
1771 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1772                                const char *src, size_t srcLen) const
1773 {
1774     srcLen = GetLength(src, srcLen);
1775     if ( srcLen == wxNO_LEN )
1776         return wxCONV_FAILED;
1777
1778     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1779     const size_t inLen = srcLen / BYTES_PER_CHAR;
1780     size_t outLen = 0;
1781     for ( size_t n = 0; n < inLen; n++ )
1782     {
1783         wxUint16 cc[2];
1784         const size_t numChars = encode_utf16(*inBuff++, cc);
1785         if ( numChars == wxCONV_FAILED )
1786             return wxCONV_FAILED;
1787
1788         outLen += numChars;
1789         if ( dst )
1790         {
1791             if ( outLen > dstLen )
1792                 return wxCONV_FAILED;
1793
1794             *dst++ = cc[0];
1795             if ( numChars == 2 )
1796             {
1797                 // second character of a surrogate
1798                 *dst++ = cc[1];
1799             }
1800         }
1801     }
1802
1803     return outLen;
1804 }
1805
1806 size_t
1807 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1808                                  const wchar_t *src, size_t srcLen) const
1809 {
1810     if ( srcLen == wxNO_LEN )
1811         srcLen = wxWcslen(src) + 1;
1812
1813     if ( !dst )
1814     {
1815         // optimization: return maximal space which could be needed for this
1816         // string instead of the exact amount which could be less if there are
1817         // any surrogates in the input
1818         //
1819         // we consider that surrogates are rare enough to make it worthwhile to
1820         // avoid running the loop below at the cost of slightly extra memory
1821         // consumption
1822         return srcLen * BYTES_PER_CHAR;
1823     }
1824
1825     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1826     size_t outLen = 0;
1827     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1828     {
1829         const wxUint32 ch = wxDecodeSurrogate(&src);
1830         if ( !src )
1831             return wxCONV_FAILED;
1832
1833         outLen += BYTES_PER_CHAR;
1834
1835         if ( outLen > dstLen )
1836             return wxCONV_FAILED;
1837
1838         *outBuff++ = ch;
1839     }
1840
1841     return outLen;
1842 }
1843
1844 // ----------------------------------------------------------------------------
1845 // endian-reversing conversions
1846 // ----------------------------------------------------------------------------
1847
1848 size_t
1849 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1850                            const char *src, size_t srcLen) const
1851 {
1852     srcLen = GetLength(src, srcLen);
1853     if ( srcLen == wxNO_LEN )
1854         return wxCONV_FAILED;
1855
1856     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1857     const size_t inLen = srcLen / BYTES_PER_CHAR;
1858     size_t outLen = 0;
1859     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1860     {
1861         wxUint16 cc[2];
1862         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1863         if ( numChars == wxCONV_FAILED )
1864             return wxCONV_FAILED;
1865
1866         outLen += numChars;
1867         if ( dst )
1868         {
1869             if ( outLen > dstLen )
1870                 return wxCONV_FAILED;
1871
1872             *dst++ = cc[0];
1873             if ( numChars == 2 )
1874             {
1875                 // second character of a surrogate
1876                 *dst++ = cc[1];
1877             }
1878         }
1879     }
1880
1881     return outLen;
1882 }
1883
1884 size_t
1885 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1886                              const wchar_t *src, size_t srcLen) const
1887 {
1888     if ( srcLen == wxNO_LEN )
1889         srcLen = wxWcslen(src) + 1;
1890
1891     if ( !dst )
1892     {
1893         // optimization: return maximal space which could be needed for this
1894         // string instead of the exact amount which could be less if there are
1895         // any surrogates in the input
1896         //
1897         // we consider that surrogates are rare enough to make it worthwhile to
1898         // avoid running the loop below at the cost of slightly extra memory
1899         // consumption
1900         return srcLen*BYTES_PER_CHAR;
1901     }
1902
1903     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1904     size_t outLen = 0;
1905     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1906     {
1907         const wxUint32 ch = wxDecodeSurrogate(&src);
1908         if ( !src )
1909             return wxCONV_FAILED;
1910
1911         outLen += BYTES_PER_CHAR;
1912
1913         if ( outLen > dstLen )
1914             return wxCONV_FAILED;
1915
1916         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1917     }
1918
1919     return outLen;
1920 }
1921
1922 #else // !WC_UTF16: wchar_t is UTF-32
1923
1924 // ----------------------------------------------------------------------------
1925 // conversions without endianness change
1926 // ----------------------------------------------------------------------------
1927
1928 size_t
1929 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1930                                const char *src, size_t srcLen) const
1931 {
1932     // use memcpy() as it should be much faster than hand-written loop
1933     srcLen = GetLength(src, srcLen);
1934     if ( srcLen == wxNO_LEN )
1935         return wxCONV_FAILED;
1936
1937     const size_t inLen = srcLen/BYTES_PER_CHAR;
1938     if ( dst )
1939     {
1940         if ( dstLen < inLen )
1941             return wxCONV_FAILED;
1942
1943         memcpy(dst, src, srcLen);
1944     }
1945
1946     return inLen;
1947 }
1948
1949 size_t
1950 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1951                                  const wchar_t *src, size_t srcLen) const
1952 {
1953     if ( srcLen == wxNO_LEN )
1954         srcLen = wxWcslen(src) + 1;
1955
1956     srcLen *= BYTES_PER_CHAR;
1957
1958     if ( dst )
1959     {
1960         if ( dstLen < srcLen )
1961             return wxCONV_FAILED;
1962
1963         memcpy(dst, src, srcLen);
1964     }
1965
1966     return srcLen;
1967 }
1968
1969 // ----------------------------------------------------------------------------
1970 // endian-reversing conversions
1971 // ----------------------------------------------------------------------------
1972
1973 size_t
1974 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1975                            const char *src, size_t srcLen) const
1976 {
1977     srcLen = GetLength(src, srcLen);
1978     if ( srcLen == wxNO_LEN )
1979         return wxCONV_FAILED;
1980
1981     srcLen /= BYTES_PER_CHAR;
1982
1983     if ( dst )
1984     {
1985         if ( dstLen < srcLen )
1986             return wxCONV_FAILED;
1987
1988         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1989         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1990         {
1991             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1992         }
1993     }
1994
1995     return srcLen;
1996 }
1997
1998 size_t
1999 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
2000                              const wchar_t *src, size_t srcLen) const
2001 {
2002     if ( srcLen == wxNO_LEN )
2003         srcLen = wxWcslen(src) + 1;
2004
2005     srcLen *= BYTES_PER_CHAR;
2006
2007     if ( dst )
2008     {
2009         if ( dstLen < srcLen )
2010             return wxCONV_FAILED;
2011
2012         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
2013         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
2014         {
2015             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
2016         }
2017     }
2018
2019     return srcLen;
2020 }
2021
2022 #endif // WC_UTF16/!WC_UTF16
2023
2024
2025 // ============================================================================
2026 // The classes doing conversion using the iconv_xxx() functions
2027 // ============================================================================
2028
2029 #ifdef HAVE_ICONV
2030
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft unconverted input bytes must be considered as failed.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// NB: the macro arguments are parenthesized so that passing an expression
//     containing operators binding less tightly than "==" (e.g. "a & b")
//     still compares the value of the whole expression.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used for this
// parameter in the iconv() declaration, as given by ICONV_CONST
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value denoting an invalid conversion descriptor, i.e. the value
// compared against the result of iconv_open() in this file
#define ICONV_T_INVALID ((iconv_t)-1)
2049
2050 #if SIZEOF_WCHAR_T == 4
2051     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2052     #define WC_ENC      wxFONTENCODING_UTF32
2053 #elif SIZEOF_WCHAR_T == 2
2054     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2055     #define WC_ENC      wxFONTENCODING_UTF16
2056 #else // sizeof(wchar_t) != 2 nor 4
2057     // does this ever happen?
2058     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2059 #endif
2060
2061 // ----------------------------------------------------------------------------
2062 // wxMBConv_iconv: encapsulates an iconv character set
2063 // ----------------------------------------------------------------------------
2064
2065 class wxMBConv_iconv : public wxMBConv
2066 {
2067 public:
2068     wxMBConv_iconv(const char *name);
2069     virtual ~wxMBConv_iconv();
2070
2071     // implement base class virtual methods
2072     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2073                            const char *src, size_t srcLen = wxNO_LEN) const;
2074     virtual size_t FromWChar(char *dst, size_t dstLen,
2075                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2076     virtual size_t GetMBNulLen() const;
2077
2078 #if wxUSE_UNICODE_UTF8
2079     virtual bool IsUTF8() const;
2080 #endif
2081
2082     virtual wxMBConv *Clone() const
2083     {
2084         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2085         p->m_minMBCharWidth = m_minMBCharWidth;
2086         return p;
2087     }
2088
2089     bool IsOk() const
2090         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2091
2092 protected:
2093     // the iconv handlers used to translate from multibyte
2094     // to wide char and in the other direction
2095     iconv_t m2w,
2096             w2m;
2097
2098 #if wxUSE_THREADS
2099     // guards access to m2w and w2m objects
2100     wxMutex m_iconvMutex;
2101 #endif
2102
2103 private:
2104     // the name (for iconv_open()) of a wide char charset -- if none is
2105     // available on this machine, it will remain NULL
2106     static wxString ms_wcCharsetName;
2107
2108     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2109     // different endian-ness than the native one
2110     static bool ms_wcNeedsSwap;
2111
2112
2113     // name of the encoding handled by this conversion
2114     wxString m_name;
2115
2116     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2117     // initially
2118     size_t m_minMBCharWidth;
2119 };
2120
2121 // make the constructor available for unit testing
2122 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2123 {
2124     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2125     if ( !result->IsOk() )
2126     {
2127         delete result;
2128         return 0;
2129     }
2130
2131     return result;
2132 }
2133
2134 wxString wxMBConv_iconv::ms_wcCharsetName;
2135 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2136
2137 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2138               : m_name(name)
2139 {
2140     m_minMBCharWidth = 0;
2141
2142     // check for charset that represents wchar_t:
2143     if ( ms_wcCharsetName.empty() )
2144     {
2145         wxLogTrace(TRACE_STRCONV, wxT("Looking for wide char codeset:"));
2146
2147 #if wxUSE_FONTMAP
2148         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2149 #else // !wxUSE_FONTMAP
2150         static const wxChar *names_static[] =
2151         {
2152 #if SIZEOF_WCHAR_T == 4
2153             wxT("UCS-4"),
2154 #elif SIZEOF_WCHAR_T = 2
2155             wxT("UCS-2"),
2156 #endif
2157             NULL
2158         };
2159         const wxChar **names = names_static;
2160 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2161
2162         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2163         {
2164             const wxString nameCS(*names);
2165
2166             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2167             wxString nameXE(nameCS);
2168
2169 #ifdef WORDS_BIGENDIAN
2170                 nameXE += wxT("BE");
2171 #else // little endian
2172                 nameXE += wxT("LE");
2173 #endif
2174
2175             wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2176                        nameXE.c_str());
2177
2178             m2w = iconv_open(nameXE.ToAscii(), name);
2179             if ( m2w == ICONV_T_INVALID )
2180             {
2181                 // try charset w/o bytesex info (e.g. "UCS4")
2182                 wxLogTrace(TRACE_STRCONV, wxT("  trying charset \"%s\""),
2183                            nameCS.c_str());
2184                 m2w = iconv_open(nameCS.ToAscii(), name);
2185
2186                 // and check for bytesex ourselves:
2187                 if ( m2w != ICONV_T_INVALID )
2188                 {
2189                     char    buf[2], *bufPtr;
2190                     wchar_t wbuf[2];
2191                     size_t  insz, outsz;
2192                     size_t  res;
2193
2194                     buf[0] = 'A';
2195                     buf[1] = 0;
2196                     wbuf[0] = 0;
2197                     insz = 2;
2198                     outsz = SIZEOF_WCHAR_T * 2;
2199                     char* wbufPtr = (char*)wbuf;
2200                     bufPtr = buf;
2201
2202                     res = iconv(
2203                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2204                         &wbufPtr, &outsz);
2205
2206                     if (ICONV_FAILED(res, insz))
2207                     {
2208                         wxLogLastError(wxT("iconv"));
2209                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2210                                    nameCS.c_str());
2211                     }
2212                     else // ok, can convert to this encoding, remember it
2213                     {
2214                         ms_wcCharsetName = nameCS;
2215                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2216                     }
2217                 }
2218             }
2219             else // use charset not requiring byte swapping
2220             {
2221                 ms_wcCharsetName = nameXE;
2222             }
2223         }
2224
2225         wxLogTrace(TRACE_STRCONV,
2226                    wxT("iconv wchar_t charset is \"%s\"%s"),
2227                    ms_wcCharsetName.empty() ? wxString("<none>")
2228                                             : ms_wcCharsetName,
2229                    ms_wcNeedsSwap ? wxT(" (needs swap)")
2230                                   : wxT(""));
2231     }
2232     else // we already have ms_wcCharsetName
2233     {
2234         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2235     }
2236
2237     if ( ms_wcCharsetName.empty() )
2238     {
2239         w2m = ICONV_T_INVALID;
2240     }
2241     else
2242     {
2243         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2244         if ( w2m == ICONV_T_INVALID )
2245         {
2246             wxLogTrace(TRACE_STRCONV,
2247                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2248                        ms_wcCharsetName.c_str(), name);
2249         }
2250     }
2251 }
2252
2253 wxMBConv_iconv::~wxMBConv_iconv()
2254 {
2255     if ( m2w != ICONV_T_INVALID )
2256         iconv_close(m2w);
2257     if ( w2m != ICONV_T_INVALID )
2258         iconv_close(w2m);
2259 }
2260
2261 size_t
2262 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2263                         const char *src, size_t srcLen) const
2264 {
2265     if ( srcLen == wxNO_LEN )
2266     {
2267         // find the string length: notice that must be done differently for
2268         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2269         // consecutive NULs
2270         const size_t nulLen = GetMBNulLen();
2271         switch ( nulLen )
2272         {
2273             default:
2274                 return wxCONV_FAILED;
2275
2276             case 1:
2277                 srcLen = strlen(src); // arguably more optimized than our version
2278                 break;
2279
2280             case 2:
2281             case 4:
2282                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2283                 // but they also have to start at character boundary and not
2284                 // span two adjacent characters
2285                 const char *p;
2286                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2287                     ;
2288                 srcLen = p - src;
2289                 break;
2290         }
2291
2292         // when we're determining the length of the string ourselves we count
2293         // the terminating NUL(s) as part of it and always NUL-terminate the
2294         // output
2295         srcLen += nulLen;
2296     }
2297
2298     // we express length in the number of (wide) characters but iconv always
2299     // counts buffer sizes it in bytes
2300     dstLen *= SIZEOF_WCHAR_T;
2301
2302 #if wxUSE_THREADS
2303     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2304     //     Unfortunately there are a couple of global wxCSConv objects such as
2305     //     wxConvLocal that are used all over wx code, so we have to make sure
2306     //     the handle is used by at most one thread at the time. Otherwise
2307     //     only a few wx classes would be safe to use from non-main threads
2308     //     as MB<->WC conversion would fail "randomly".
2309     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2310 #endif // wxUSE_THREADS
2311
2312     size_t res, cres;
2313     const char *pszPtr = src;
2314
2315     if ( dst )
2316     {
2317         char* bufPtr = (char*)dst;
2318
2319         // have destination buffer, convert there
2320         size_t dstLenOrig = dstLen;
2321         cres = iconv(m2w,
2322                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2323                      &bufPtr, &dstLen);
2324
2325         // convert the number of bytes converted as returned by iconv to the
2326         // number of (wide) characters converted that we need
2327         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2328
2329         if (ms_wcNeedsSwap)
2330         {
2331             // convert to native endianness
2332             for ( unsigned i = 0; i < res; i++ )
2333                 dst[i] = WC_BSWAP(dst[i]);
2334         }
2335     }
2336     else // no destination buffer
2337     {
2338         // convert using temp buffer to calculate the size of the buffer needed
2339         wchar_t tbuf[256];
2340         res = 0;
2341
2342         do
2343         {
2344             char* bufPtr = (char*)tbuf;
2345             dstLen = 8 * SIZEOF_WCHAR_T;
2346
2347             cres = iconv(m2w,
2348                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2349                          &bufPtr, &dstLen );
2350
2351             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2352         }
2353         while ((cres == (size_t)-1) && (errno == E2BIG));
2354     }
2355
2356     if (ICONV_FAILED(cres, srcLen))
2357     {
2358         //VS: it is ok if iconv fails, hence trace only
2359         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2360         return wxCONV_FAILED;
2361     }
2362
2363     return res;
2364 }
2365
2366 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2367                                  const wchar_t *src, size_t srcLen) const
2368 {
2369 #if wxUSE_THREADS
2370     // NB: explained in MB2WC
2371     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2372 #endif
2373
2374     if ( srcLen == wxNO_LEN )
2375         srcLen = wxWcslen(src) + 1;
2376
2377     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2378     size_t outbuflen = dstLen;
2379     size_t res, cres;
2380
2381     wchar_t *tmpbuf = 0;
2382
2383     if (ms_wcNeedsSwap)
2384     {
2385         // need to copy to temp buffer to switch endianness
2386         // (doing WC_BSWAP twice on the original buffer won't work, as it
2387         //  could be in read-only memory, or be accessed in some other thread)
2388         tmpbuf = (wchar_t *)malloc(inbuflen);
2389         for ( size_t i = 0; i < srcLen; i++ )
2390             tmpbuf[i] = WC_BSWAP(src[i]);
2391
2392         src = tmpbuf;
2393     }
2394
2395     char* inbuf = (char*)src;
2396     if ( dst )
2397     {
2398         // have destination buffer, convert there
2399         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2400
2401         res = dstLen - outbuflen;
2402     }
2403     else // no destination buffer
2404     {
2405         // convert using temp buffer to calculate the size of the buffer needed
2406         char tbuf[256];
2407         res = 0;
2408         do
2409         {
2410             dst = tbuf;
2411             outbuflen = WXSIZEOF(tbuf);
2412
2413             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2414
2415             res += WXSIZEOF(tbuf) - outbuflen;
2416         }
2417         while ((cres == (size_t)-1) && (errno == E2BIG));
2418     }
2419
2420     if (ms_wcNeedsSwap)
2421     {
2422         free(tmpbuf);
2423     }
2424
2425     if (ICONV_FAILED(cres, inbuflen))
2426     {
2427         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2428         return wxCONV_FAILED;
2429     }
2430
2431     return res;
2432 }
2433
2434 size_t wxMBConv_iconv::GetMBNulLen() const
2435 {
2436     if ( m_minMBCharWidth == 0 )
2437     {
2438         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2439
2440 #if wxUSE_THREADS
2441         // NB: explained in MB2WC
2442         wxMutexLocker lock(self->m_iconvMutex);
2443 #endif
2444
2445         const wchar_t *wnul = L"";
2446         char buf[8]; // should be enough for NUL in any encoding
2447         size_t inLen = sizeof(wchar_t),
2448                outLen = WXSIZEOF(buf);
2449         char *inBuff = (char *)wnul;
2450         char *outBuff = buf;
2451         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2452         {
2453             self->m_minMBCharWidth = (size_t)-1;
2454         }
2455         else // ok
2456         {
2457             self->m_minMBCharWidth = outBuff - buf;
2458         }
2459     }
2460
2461     return m_minMBCharWidth;
2462 }
2463
#if wxUSE_UNICODE_UTF8
// Check whether the charset name this converter was created with is one of
// the two spellings of UTF-8 recognized here, ignoring case.
bool wxMBConv_iconv::IsUTF8() const
{
    if ( wxStricmp(m_name, "UTF-8") == 0 )
        return true;

    return wxStricmp(m_name, "UTF8") == 0;
}
#endif
2471
2472 #endif // HAVE_ICONV
2473
2474
2475 // ============================================================================
2476 // Win32 conversion classes
2477 // ============================================================================
2478
2479 #ifdef wxHAVE_WIN32_MB2WC
2480
2481 // from utils.cpp
2482 #if wxUSE_FONTMAP
2483 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2484 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2485 #endif
2486
2487 class wxMBConv_win32 : public wxMBConv
2488 {
2489 public:
2490     wxMBConv_win32()
2491     {
2492         m_CodePage = CP_ACP;
2493         m_minMBCharWidth = 0;
2494     }
2495
2496     wxMBConv_win32(const wxMBConv_win32& conv)
2497         : wxMBConv()
2498     {
2499         m_CodePage = conv.m_CodePage;
2500         m_minMBCharWidth = conv.m_minMBCharWidth;
2501     }
2502
2503 #if wxUSE_FONTMAP
2504     wxMBConv_win32(const char* name)
2505     {
2506         m_CodePage = wxCharsetToCodepage(name);
2507         m_minMBCharWidth = 0;
2508     }
2509
2510     wxMBConv_win32(wxFontEncoding encoding)
2511     {
2512         m_CodePage = wxEncodingToCodepage(encoding);
2513         m_minMBCharWidth = 0;
2514     }
2515 #endif // wxUSE_FONTMAP
2516
2517     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2518     {
2519         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2520         // the behaviour is not compatible with the Unix version (using iconv)
2521         // and break the library itself, e.g. wxTextInputStream::NextChar()
2522         // wouldn't work if reading an incomplete MB char didn't result in an
2523         // error
2524         //
2525         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2526         // Win XP or newer and it is not supported for UTF-[78] so we always
2527         // use our own conversions in this case. See
2528         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2529         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2530         if ( m_CodePage == CP_UTF8 )
2531         {
2532             return wxMBConvUTF8().MB2WC(buf, psz, n);
2533         }
2534
2535         if ( m_CodePage == CP_UTF7 )
2536         {
2537             return wxMBConvUTF7().MB2WC(buf, psz, n);
2538         }
2539
2540         int flags = 0;
2541         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2542                 IsAtLeastWin2kSP4() )
2543         {
2544             flags = MB_ERR_INVALID_CHARS;
2545         }
2546
2547         const size_t len = ::MultiByteToWideChar
2548                              (
2549                                 m_CodePage,     // code page
2550                                 flags,          // flags: fall on error
2551                                 psz,            // input string
2552                                 -1,             // its length (NUL-terminated)
2553                                 buf,            // output string
2554                                 buf ? n : 0     // size of output buffer
2555                              );
2556         if ( !len )
2557         {
2558             // function totally failed
2559             return wxCONV_FAILED;
2560         }
2561
2562         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2563         // check if we succeeded, by doing a double trip:
2564         if ( !flags && buf )
2565         {
2566             const size_t mbLen = strlen(psz);
2567             wxCharBuffer mbBuf(mbLen);
2568             if ( ::WideCharToMultiByte
2569                    (
2570                       m_CodePage,
2571                       0,
2572                       buf,
2573                       -1,
2574                       mbBuf.data(),
2575                       mbLen + 1,        // size in bytes, not length
2576                       NULL,
2577                       NULL
2578                    ) == 0 ||
2579                   strcmp(mbBuf, psz) != 0 )
2580             {
2581                 // we didn't obtain the same thing we started from, hence
2582                 // the conversion was lossy and we consider that it failed
2583                 return wxCONV_FAILED;
2584             }
2585         }
2586
2587         // note that it returns count of written chars for buf != NULL and size
2588         // of the needed buffer for buf == NULL so in either case the length of
2589         // the string (which never includes the terminating NUL) is one less
2590         return len - 1;
2591     }
2592
2593     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2594     {
2595         /*
2596             we have a problem here: by default, WideCharToMultiByte() may
2597             replace characters unrepresentable in the target code page with bad
2598             quality approximations such as turning "1/2" symbol (U+00BD) into
2599             "1" for the code pages which don't have it and we, obviously, want
2600             to avoid this at any price
2601
2602             the trouble is that this function does it _silently_, i.e. it won't
2603             even tell us whether it did or not... Win98/2000 and higher provide
2604             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2605             we have to resort to a round trip, i.e. check that converting back
2606             results in the same string -- this is, of course, expensive but
2607             otherwise we simply can't be sure to not garble the data.
2608          */
2609
2610         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2611         // it doesn't work with CJK encodings (which we test for rather roughly
2612         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2613         // supporting it
2614         BOOL usedDef wxDUMMY_INITIALIZE(false);
2615         BOOL *pUsedDef;
2616         int flags;
2617         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2618         {
2619             // it's our lucky day
2620             flags = WC_NO_BEST_FIT_CHARS;
2621             pUsedDef = &usedDef;
2622         }
2623         else // old system or unsupported encoding
2624         {
2625             flags = 0;
2626             pUsedDef = NULL;
2627         }
2628
2629         const size_t len = ::WideCharToMultiByte
2630                              (
2631                                 m_CodePage,     // code page
2632                                 flags,          // either none or no best fit
2633                                 pwz,            // input string
2634                                 -1,             // it is (wide) NUL-terminated
2635                                 buf,            // output buffer
2636                                 buf ? n : 0,    // and its size
2637                                 NULL,           // default "replacement" char
2638                                 pUsedDef        // [out] was it used?
2639                              );
2640
2641         if ( !len )
2642         {
2643             // function totally failed
2644             return wxCONV_FAILED;
2645         }
2646
2647         // we did something, check if we really succeeded
2648         if ( flags )
2649         {
2650             // check if the conversion failed, i.e. if any replacements
2651             // were done
2652             if ( usedDef )
2653                 return wxCONV_FAILED;
2654         }
2655         else // we must resort to double tripping...
2656         {
2657             // first we need to ensure that we really have the MB data: this is
2658             // not the case if we're called with NULL buffer, in which case we
2659             // need to do the conversion yet again
2660             wxCharBuffer bufDef;
2661             if ( !buf )
2662             {
2663                 bufDef = wxCharBuffer(len);
2664                 buf = bufDef.data();
2665                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2666                                             buf, len, NULL, NULL) )
2667                     return wxCONV_FAILED;
2668             }
2669
2670             if ( !n )
2671                 n = wcslen(pwz);
2672             wxWCharBuffer wcBuf(n);
2673             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2674                     wcscmp(wcBuf, pwz) != 0 )
2675             {
2676                 // we didn't obtain the same thing we started from, hence
2677                 // the conversion was lossy and we consider that it failed
2678                 return wxCONV_FAILED;
2679             }
2680         }
2681
2682         // see the comment above for the reason of "len - 1"
2683         return len - 1;
2684     }
2685
2686     virtual size_t GetMBNulLen() const
2687     {
2688         if ( m_minMBCharWidth == 0 )
2689         {
2690             int len = ::WideCharToMultiByte
2691                         (
2692                             m_CodePage,     // code page
2693                             0,              // no flags
2694                             L"",            // input string
2695                             1,              // translate just the NUL
2696                             NULL,           // output buffer
2697                             0,              // and its size
2698                             NULL,           // no replacement char
2699                             NULL            // [out] don't care if it was used
2700                         );
2701
2702             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2703             switch ( len )
2704             {
2705                 default:
2706                     wxLogDebug(wxT("Unexpected NUL length %d"), len);
2707                     self->m_minMBCharWidth = (size_t)-1;
2708                     break;
2709
2710                 case 0:
2711                     self->m_minMBCharWidth = (size_t)-1;
2712                     break;
2713
2714                 case 1:
2715                 case 2:
2716                 case 4:
2717                     self->m_minMBCharWidth = len;
2718                     break;
2719             }
2720         }
2721
2722         return m_minMBCharWidth;
2723     }
2724
2725     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2726
2727     bool IsOk() const { return m_CodePage != -1; }
2728
2729 private:
2730     static bool CanUseNoBestFit()
2731     {
2732         static int s_isWin98Or2k = -1;
2733
2734         if ( s_isWin98Or2k == -1 )
2735         {
2736             int verMaj, verMin;
2737             switch ( wxGetOsVersion(&verMaj, &verMin) )
2738             {
2739                 case wxOS_WINDOWS_9X:
2740                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2741                     break;
2742
2743                 case wxOS_WINDOWS_NT:
2744                     s_isWin98Or2k = verMaj >= 5;
2745                     break;
2746
2747                 default:
2748                     // unknown: be conservative by default
2749                     s_isWin98Or2k = 0;
2750                     break;
2751             }
2752
2753             wxASSERT_MSG( s_isWin98Or2k != -1, wxT("should be set above") );
2754         }
2755
2756         return s_isWin98Or2k == 1;
2757     }
2758
2759     static bool IsAtLeastWin2kSP4()
2760     {
2761 #ifdef __WXWINCE__
2762         return false;
2763 #else
2764         static int s_isAtLeastWin2kSP4 = -1;
2765
2766         if ( s_isAtLeastWin2kSP4 == -1 )
2767         {
2768             OSVERSIONINFOEX ver;
2769
2770             memset(&ver, 0, sizeof(ver));
2771             ver.dwOSVersionInfoSize = sizeof(ver);
2772             GetVersionEx((OSVERSIONINFO*)&ver);
2773
2774             s_isAtLeastWin2kSP4 =
2775               ((ver.dwMajorVersion > 5) || // Vista+
2776                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2777                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2778                ver.wServicePackMajor >= 4)) // 2000 SP4+
2779               ? 1 : 0;
2780         }
2781
2782         return s_isAtLeastWin2kSP4 == 1;
2783 #endif
2784     }
2785
2786
2787     // the code page we're working with
2788     long m_CodePage;
2789
2790     // cached result of GetMBNulLen(), set to 0 initially meaning
2791     // "unknown"
2792     size_t m_minMBCharWidth;
2793 };
2794
2795 #endif // wxHAVE_WIN32_MB2WC
2796
2797
2798 // ============================================================================
2799 // wxEncodingConverter based conversion classes
2800 // ============================================================================
2801
2802 #if wxUSE_FONTMAP
2803
2804 class wxMBConv_wxwin : public wxMBConv
2805 {
2806 private:
2807     void Init()
2808     {
2809         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2810         // The wxMBConv_cf class does a better job.
2811         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2812                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2813                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2814     }
2815
2816 public:
2817     // temporarily just use wxEncodingConverter stuff,
2818     // so that it works while a better implementation is built
2819     wxMBConv_wxwin(const char* name)
2820     {
2821         if (name)
2822             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2823         else
2824             m_enc = wxFONTENCODING_SYSTEM;
2825
2826         Init();
2827     }
2828
2829     wxMBConv_wxwin(wxFontEncoding enc)
2830     {
2831         m_enc = enc;
2832
2833         Init();
2834     }
2835
2836     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2837     {
2838         size_t inbuf = strlen(psz);
2839         if (buf)
2840         {
2841             if (!m2w.Convert(psz, buf))
2842                 return wxCONV_FAILED;
2843         }
2844         return inbuf;
2845     }
2846
2847     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2848     {
2849         const size_t inbuf = wxWcslen(psz);
2850         if (buf)
2851         {
2852             if (!w2m.Convert(psz, buf))
2853                 return wxCONV_FAILED;
2854         }
2855
2856         return inbuf;
2857     }
2858
2859     virtual size_t GetMBNulLen() const
2860     {
2861         switch ( m_enc )
2862         {
2863             case wxFONTENCODING_UTF16BE:
2864             case wxFONTENCODING_UTF16LE:
2865                 return 2;
2866
2867             case wxFONTENCODING_UTF32BE:
2868             case wxFONTENCODING_UTF32LE:
2869                 return 4;
2870
2871             default:
2872                 return 1;
2873         }
2874     }
2875
2876     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2877
2878     bool IsOk() const { return m_ok; }
2879
2880 public:
2881     wxFontEncoding m_enc;
2882     wxEncodingConverter m2w, w2m;
2883
2884 private:
2885     // were we initialized successfully?
2886     bool m_ok;
2887
2888     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2889 };
2890
2891 // make the constructors available for unit testing
2892 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2893 {
2894     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2895     if ( !result->IsOk() )
2896     {
2897         delete result;
2898         return 0;
2899     }
2900
2901     return result;
2902 }
2903
2904 #endif // wxUSE_FONTMAP
2905
2906 // ============================================================================
2907 // wxCSConv implementation
2908 // ============================================================================
2909
2910 void wxCSConv::Init()
2911 {
2912     m_name = NULL;
2913     m_convReal =  NULL;
2914     m_deferred = true;
2915 }
2916
2917 wxCSConv::wxCSConv(const wxString& charset)
2918 {
2919     Init();
2920
2921     if ( !charset.empty() )
2922     {
2923         SetName(charset.ToAscii());
2924     }
2925
2926 #if wxUSE_FONTMAP
2927     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2928     if ( m_encoding == wxFONTENCODING_MAX )
2929     {
2930         // set to unknown/invalid value
2931         m_encoding = wxFONTENCODING_SYSTEM;
2932     }
2933     else if ( m_encoding == wxFONTENCODING_DEFAULT )
2934     {
2935         // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2936         m_encoding = wxFONTENCODING_ISO8859_1;
2937     }
2938 #else
2939     m_encoding = wxFONTENCODING_SYSTEM;
2940 #endif
2941 }
2942
2943 wxCSConv::wxCSConv(wxFontEncoding encoding)
2944 {
2945     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2946     {
2947         wxFAIL_MSG( wxT("invalid encoding value in wxCSConv ctor") );
2948
2949         encoding = wxFONTENCODING_SYSTEM;
2950     }
2951
2952     Init();
2953
2954     m_encoding = encoding;
2955 }
2956
2957 wxCSConv::~wxCSConv()
2958 {
2959     Clear();
2960 }
2961
2962 wxCSConv::wxCSConv(const wxCSConv& conv)
2963         : wxMBConv()
2964 {
2965     Init();
2966
2967     SetName(conv.m_name);
2968     m_encoding = conv.m_encoding;
2969 }
2970
2971 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2972 {
2973     Clear();
2974
2975     SetName(conv.m_name);
2976     m_encoding = conv.m_encoding;
2977
2978     return *this;
2979 }
2980
2981 void wxCSConv::Clear()
2982 {
2983     free(m_name);
2984     delete m_convReal;
2985
2986     m_name = NULL;
2987     m_convReal = NULL;
2988 }
2989
2990 void wxCSConv::SetName(const char *charset)
2991 {
2992     if (charset)
2993     {
2994         m_name = wxStrdup(charset);
2995         m_deferred = true;
2996     }
2997 }
2998
2999 #if wxUSE_FONTMAP
3000
// Map from font encodings to charset names, used by wxCSConv::DoCreate() to
// remember which name wxMBConv_iconv accepted for an encoding; an empty
// string is stored there when none of the candidate names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the single cache instance, local to this file
static wxEncodingNameCache gs_nameCache;
3005 #endif
3006
3007 wxMBConv *wxCSConv::DoCreate() const
3008 {
3009 #if wxUSE_FONTMAP
3010     wxLogTrace(TRACE_STRCONV,
3011                wxT("creating conversion for %s"),
3012                (m_name ? m_name
3013                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
3014 #endif // wxUSE_FONTMAP
3015
3016     // check for the special case of ASCII or ISO8859-1 charset: as we have
3017     // special knowledge of it anyhow, we don't need to create a special
3018     // conversion object
3019     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
3020             m_encoding == wxFONTENCODING_DEFAULT )
3021     {
3022         // don't convert at all
3023         return NULL;
3024     }
3025
3026     // we trust OS to do conversion better than we can so try external
3027     // conversion methods first
3028     //
3029     // the full order is:
3030     //      1. OS conversion (iconv() under Unix or Win32 API)
3031     //      2. hard coded conversions for UTF
3032     //      3. wxEncodingConverter as fall back
3033
3034     // step (1)
3035 #ifdef HAVE_ICONV
3036 #if !wxUSE_FONTMAP
3037     if ( m_name )
3038 #endif // !wxUSE_FONTMAP
3039     {
3040 #if wxUSE_FONTMAP
3041         wxFontEncoding encoding(m_encoding);
3042 #endif
3043
3044         if ( m_name )
3045         {
3046             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3047             if ( conv->IsOk() )
3048                 return conv;
3049
3050             delete conv;
3051
3052 #if wxUSE_FONTMAP
3053             encoding =
3054                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3055 #endif // wxUSE_FONTMAP
3056         }
3057 #if wxUSE_FONTMAP
3058         {
3059             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3060             if ( it != gs_nameCache.end() )
3061             {
3062                 if ( it->second.empty() )
3063                     return NULL;
3064
3065                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3066                 if ( conv->IsOk() )
3067                     return conv;
3068
3069                 delete conv;
3070             }
3071
3072             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3073             // CS : in case this does not return valid names (eg for MacRoman)
3074             // encoding got a 'failure' entry in the cache all the same,
3075             // although it just has to be created using a different method, so
3076             // only store failed iconv creation attempts (or perhaps we
3077             // shoulnd't do this at all ?)
3078             if ( names[0] != NULL )
3079             {
3080                 for ( ; *names; ++names )
3081                 {
3082                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3083                     //             will need changes that will obsolete this
3084                     wxString name(*names);
3085                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3086                     if ( conv->IsOk() )
3087                     {
3088                         gs_nameCache[encoding] = *names;
3089                         return conv;
3090                     }
3091
3092                     delete conv;
3093                 }
3094
3095                 gs_nameCache[encoding] = wxT(""); // cache the failure
3096             }
3097         }
3098 #endif // wxUSE_FONTMAP
3099     }
3100 #endif // HAVE_ICONV
3101
3102 #ifdef wxHAVE_WIN32_MB2WC
3103     {
3104 #if wxUSE_FONTMAP
3105         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3106                                       : new wxMBConv_win32(m_encoding);
3107         if ( conv->IsOk() )
3108             return conv;
3109
3110         delete conv;
3111 #else
3112         return NULL;
3113 #endif
3114     }
3115 #endif // wxHAVE_WIN32_MB2WC
3116
3117 #ifdef __DARWIN__
3118     {
3119         // leave UTF16 and UTF32 to the built-ins of wx
3120         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3121             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3122         {
3123 #if wxUSE_FONTMAP
3124             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3125                                           : new wxMBConv_cf(m_encoding);
3126 #else
3127             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3128 #endif
3129
3130             if ( conv->IsOk() )
3131                  return conv;
3132
3133             delete conv;
3134         }
3135     }
3136 #endif // __DARWIN__
3137
3138     // step (2)
3139     wxFontEncoding enc = m_encoding;
3140 #if wxUSE_FONTMAP
3141     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3142     {
3143         // use "false" to suppress interactive dialogs -- we can be called from
3144         // anywhere and popping up a dialog from here is the last thing we want to
3145         // do
3146         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3147     }
3148 #endif // wxUSE_FONTMAP
3149
3150     switch ( enc )
3151     {
3152         case wxFONTENCODING_UTF7:
3153              return new wxMBConvUTF7;
3154
3155         case wxFONTENCODING_UTF8:
3156              return new wxMBConvUTF8;
3157
3158         case wxFONTENCODING_UTF16BE:
3159              return new wxMBConvUTF16BE;
3160
3161         case wxFONTENCODING_UTF16LE:
3162              return new wxMBConvUTF16LE;
3163
3164         case wxFONTENCODING_UTF32BE:
3165              return new wxMBConvUTF32BE;
3166
3167         case wxFONTENCODING_UTF32LE:
3168              return new wxMBConvUTF32LE;
3169
3170         default:
3171              // nothing to do but put here to suppress gcc warnings
3172              break;
3173     }
3174
3175     // step (3)
3176 #if wxUSE_FONTMAP
3177     {
3178         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3179                                       : new wxMBConv_wxwin(m_encoding);
3180         if ( conv->IsOk() )
3181             return conv;
3182
3183         delete conv;
3184     }
3185
3186     wxLogTrace(TRACE_STRCONV,
3187                wxT("encoding \"%s\" is not supported by this system"),
3188                (m_name ? wxString(m_name)
3189                        : wxFontMapperBase::GetEncodingName(m_encoding)));
3190 #endif // wxUSE_FONTMAP
3191
3192     return NULL;
3193 }
3194
3195 void wxCSConv::CreateConvIfNeeded() const
3196 {
3197     if ( m_deferred )
3198     {
3199         wxCSConv *self = (wxCSConv *)this; // const_cast
3200
3201         // if we don't have neither the name nor the encoding, use the default
3202         // encoding for this system
3203         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3204         {
3205 #if wxUSE_INTL
3206             self->m_encoding = wxLocale::GetSystemEncoding();
3207 #else
3208             // fallback to some reasonable default:
3209             self->m_encoding = wxFONTENCODING_ISO8859_1;
3210 #endif // wxUSE_INTL
3211         }
3212
3213         self->m_convReal = DoCreate();
3214         self->m_deferred = false;
3215     }
3216 }
3217
3218 bool wxCSConv::IsOk() const
3219 {
3220     CreateConvIfNeeded();
3221
3222     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3223     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3224         return true; // always ok as we do it ourselves
3225
3226     // m_convReal->IsOk() is called at its own creation, so we know it must
3227     // be ok if m_convReal is non-NULL
3228     return m_convReal != NULL;
3229 }
3230
3231 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3232                          const char *src, size_t srcLen) const
3233 {
3234     CreateConvIfNeeded();
3235
3236     if (m_convReal)
3237         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3238
3239     // latin-1 (direct)
3240     if ( srcLen == wxNO_LEN )
3241         srcLen = strlen(src) + 1; // take trailing NUL too
3242
3243     if ( dst )
3244     {
3245         if ( dstLen < srcLen )
3246             return wxCONV_FAILED;
3247
3248         for ( size_t n = 0; n < srcLen; n++ )
3249             dst[n] = (unsigned char)(src[n]);
3250     }
3251
3252     return srcLen;
3253 }
3254
3255 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3256                            const wchar_t *src, size_t srcLen) const
3257 {
3258     CreateConvIfNeeded();
3259
3260     if (m_convReal)
3261         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3262
3263     // latin-1 (direct)
3264     if ( srcLen == wxNO_LEN )
3265         srcLen = wxWcslen(src) + 1;
3266
3267     if ( dst )
3268     {
3269         if ( dstLen < srcLen )
3270             return wxCONV_FAILED;
3271
3272         for ( size_t n = 0; n < srcLen; n++ )
3273         {
3274             if ( src[n] > 0xFF )
3275                 return wxCONV_FAILED;
3276
3277             dst[n] = (char)src[n];
3278         }
3279
3280     }
3281     else // still need to check the input validity
3282     {
3283         for ( size_t n = 0; n < srcLen; n++ )
3284         {
3285             if ( src[n] > 0xFF )
3286                 return wxCONV_FAILED;
3287         }
3288     }
3289
3290     return srcLen;
3291 }
3292
3293 size_t wxCSConv::GetMBNulLen() const
3294 {
3295     CreateConvIfNeeded();
3296
3297     if ( m_convReal )
3298     {
3299         return m_convReal->GetMBNulLen();
3300     }
3301
3302     // otherwise, we are ISO-8859-1
3303     return 1;
3304 }
3305
3306 #if wxUSE_UNICODE_UTF8
3307 bool wxCSConv::IsUTF8() const
3308 {
3309     CreateConvIfNeeded();
3310
3311     if ( m_convReal )
3312     {
3313         return m_convReal->IsUTF8();
3314     }
3315
3316     // otherwise, we are ISO-8859-1
3317     return false;
3318 }
3319 #endif
3320
3321
3322 #if wxUSE_UNICODE
3323
3324 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3325 {
3326     if ( !s )
3327         return wxWCharBuffer();
3328
3329     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3330     if ( !wbuf )
3331         wbuf = wxMBConvUTF8().cMB2WX(s);
3332     if ( !wbuf )
3333         wbuf = wxConvISO8859_1.cMB2WX(s);
3334
3335     return wbuf;
3336 }
3337
3338 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3339 {
3340     if ( !ws )
3341         return wxCharBuffer();
3342
3343     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3344     if ( !buf )
3345         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3346
3347     return buf;
3348 }
3349
3350 #endif // wxUSE_UNICODE
3351
3352 // ----------------------------------------------------------------------------
3353 // globals
3354 // ----------------------------------------------------------------------------
3355
// NB: The reason why we create converter objects in this convoluted way,
3357 //     using a factory function instead of global variable, is that they
3358 //     may be used at static initialization time (some of them are used by
3359 //     wxString ctors and there may be a global wxString object). In other
3360 //     words, possibly _before_ the converter global object would be
3361 //     initialized.
3362
3363 #undef wxConvLibc
3364 #undef wxConvUTF8
3365 #undef wxConvUTF7
3366 #undef wxConvLocal
3367 #undef wxConvISO8859_1
3368
// WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args) expands to:
//
//  - the definition of the klass* variable name##Ptr, initialized to NULL
//  - the accessor function wxGet_##name##Ptr() returning the address of a
//    function-local static object of type impl_klass constructed with
//    ctor_args
//  - a file-static pointer initialized by calling this accessor
//
// The macro expansion doesn't end with a semicolon, it must be provided at
// the point of use.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Same as WX_DEFINE_GLOBAL_CONV2() for the common case when the class of the
// object is the same as the class of the pointer returned by the accessor.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3383
3384 #ifdef __INTELC__
3385     // disable warning "variable 'xxx' was declared but never referenced"
3386     #pragma warning(disable: 177)
3387 #endif // Intel C++
3388
// Define wxConvLibc: it is a default-constructed wxMBConv_win32 under Windows
// and wxMBConvLibc elsewhere; the wxMBConv_cf branch is currently disabled
// (notice the "#elif 0").
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
3396
3397 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3398 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3399 //     provokes an error message about "not enough macro parameters"; and we
3400 //     can't use "()" here as the name##Obj declaration would be parsed as a
3401 //     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
// default-constructed UTF-8 and UTF-7 converters (the lone semicolon stands
// for the empty constructor arguments)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// wxCSConv objects for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to the wxConvLibc and wxConvLocal objects
// respectively
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3411
#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// The converter used for file names: under Darwin it is the wxMBConv_cf UTF-8
// object defined just above, on all the other platforms it is the same
// object as wxConvLibc.
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3424
3425 #else // !wxUSE_WCHAR_T
3426
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: all of them are plain wxMBConv objects
// (and not pointers or accessor functions as in the wxUSE_WCHAR_T case)
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3433
3434 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T