src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
 158 size_t
 159 wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
 160                   const char *src, size_t srcLen) const
 161 {
 162     // although new conversion classes are supposed to implement this function
 163     // directly, the existing ones only implement the old MB2WC() and so, to
 164     // avoid to have to rewrite all conversion classes at once, we provide a
 165     // default (but not efficient) implementation of this one in terms of the
 166     // old function by copying the input to ensure that it's NUL-terminated and
 167     // then using MB2WC() to convert it
 168     //
 169     // moreover, some conversion classes simply can't implement ToWChar()
 170     // directly, the primary example is wxConvLibc: mbstowcs() only handles
 171     // NUL-terminated strings
 172
 173     // the number of chars [which would be] written to dst [if it were not NULL]
 174     size_t dstWritten = 0;
 175
 176     // the number of NULs terminating this string
 177     size_t nulLen = 0;  // not really needed, but just to avoid warnings
 178
 179     // if we were not given the input size we just have to assume that the
 180     // string is properly terminated as we have no way of knowing how long it
 181     // is anyhow, but if we do have the size check whether there are enough
 182     // NULs at the end
 183     wxCharBuffer bufTmp;
 184     const char *srcEnd;
 185     if ( srcLen != wxNO_LEN )
 186     {
 187         // we need to know how to find the end of this string
 188         nulLen = GetMBNulLen();
 189         if ( nulLen == wxCONV_FAILED )
 190             return wxCONV_FAILED;
 191
 192         // if there are enough NULs we can avoid the copy
 193         if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
 194         {
 195             // make a copy in order to properly NUL-terminate the string
 196             bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
 197             char * const p = bufTmp.data();
 198             memcpy(p, src, srcLen);
 199             for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
 200                 *s = '\0';
 201
 202             src = bufTmp;
 203         }
 204
 205         srcEnd = src + srcLen;
 206     }
 207     else // quit after the first loop iteration
 208     {
 209         srcEnd = NULL;
 210     }
 211
 212     // the idea of this code is straightforward: it converts a NUL-terminated
 213     // chunk of the string during each iteration and updates the output buffer
 214     // with the result
 215     //
 216     // all the complication come from the fact that this function, for
 217     // historical reasons, must behave in 2 subtly different ways when it's
 218     // called with a fixed number of characters and when it's called for the
 219     // entire NUL-terminated string: in the former case (srcEnd == NULL) we
 220     // must count all characters we convert, NUL or not; but in the latter we
 221     // do not count the trailing NUL -- but still count all the NULs inside the
 222     // string
 223     //
 224     // so for the (simple) former case we just always count the trailing NUL,
 225     // but for the latter we need to wait until we see if there is going to be
 226     // another loop iteration and only count it then
 227     for ( ;; )
 228     {
 229         // try to convert the current chunk
 230         size_t lenChunk = MB2WC(NULL, src, 0);
 231         if ( lenChunk == wxCONV_FAILED )
 232             return wxCONV_FAILED;
 233
 234         dstWritten += lenChunk;
 235         if ( !srcEnd )
 236             dstWritten++;
 237
 238         if ( !lenChunk )
 239         {
 240             // nothing left in the input string, conversion succeeded
 241             break;
 242         }
 243
 244         if ( dst )
 245         {
 246             if ( dstWritten > dstLen )
 247                 return wxCONV_FAILED;
 248
 249             // +1 is for trailing NUL
 250             if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
 251                 return wxCONV_FAILED;
 252
 253             dst += lenChunk;
 254             if ( !srcEnd )
 255                 dst++;
 256         }
 257
 258         if ( !srcEnd )
 259         {
 260             // we convert just one chunk in this case as this is the entire
 261             // string anyhow
 262             break;
 263         }
 264
 265         // advance the input pointer past the end of this chunk
 266         while ( NotAllNULs(src, nulLen) )
 267         {
 268             // notice that we must skip over multiple bytes here as we suppose
 269             // that if NUL takes 2 or 4 bytes, then all the other characters do
 270             // too and so if advanced by a single byte we might erroneously
 271             // detect sequences of NUL bytes in the middle of the input
 272             src += nulLen;
 273         }
 274
 275         src += nulLen; // skipping over its terminator as well
 276
 277         // note that ">=" (and not just "==") is needed here as the terminator
 278         // we skipped just above could be inside or just after the buffer
 279         // delimited by srcEnd
 280         if ( src >= srcEnd )
 281             break;
 282
 283         // if we got here then this wasn't the last chunk in this string and
 284         // hence we must count an extra char for L'\0' even when converting a
 285         // fixed number of characters
 286         if ( srcEnd )
 287         {
 288             dstWritten++;
 289             if ( dst )
 290                 dst++;
 291         }
 292     }
 293
 294     return dstWritten;
 295 }
 296
 297 size_t
 298 wxMBConv::FromWChar(char *dst, size_t dstLen,
 299                     const wchar_t *src, size_t srcLen) const
 300 {
 301     // the number of chars [which would be] written to dst [if it were not NULL]
 302     size_t dstWritten = 0;
 303
 304     // if we don't know its length we have no choice but to assume that it is
 305     // NUL-terminated (notice that it can still be NUL-terminated even if
 306     // explicit length is given but it doesn't change our return value)
 307     const bool isNulTerminated = srcLen == wxNO_LEN;
 308
 309     // make a copy of the input string unless it is already properly
 310     // NUL-terminated
 311     wxWCharBuffer bufTmp;
 312     if ( isNulTerminated )
 313     {
 314         srcLen = wxWcslen(src) + 1;
 315     }
 316     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 317     {
 318         // make a copy in order to properly NUL-terminate the string
 319         bufTmp = wxWCharBuffer(srcLen);
 320         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 321         src = bufTmp;
 322     }
 323
 324     const size_t lenNul = GetMBNulLen();
 325     for ( const wchar_t * const srcEnd = src + srcLen;
 326           src < srcEnd;
 327           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 328     {
 329         // try to convert the current chunk
 330         size_t lenChunk = WC2MB(NULL, src, 0);
 331
 332         if ( lenChunk == wxCONV_FAILED )
 333             return wxCONV_FAILED;
 334
 335         dstWritten += lenChunk;
 336         if ( isNulTerminated )
 337             dstWritten += lenNul;
 338
 339         if ( dst )
 340         {
 341             if ( dstWritten > dstLen )
 342                 return wxCONV_FAILED;
 343
 344             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 345                 return wxCONV_FAILED;
 346
 347             dst += lenChunk;
 348             if ( isNulTerminated )
 349                 dst += lenNul;
 350         }
 351     }
 352
 353     return dstWritten;
 354 }
 355
 356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 357 {
 358     size_t rc = ToWChar(outBuff, outLen, inBuff);
 359     if ( rc != wxCONV_FAILED )
 360     {
 361         // ToWChar() returns the buffer length, i.e. including the trailing
 362         // NUL, while this method doesn't take it into account
 363         rc--;
 364     }
 365
 366     return rc;
 367 }
 368
 369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 370 {
 371     size_t rc = FromWChar(outBuff, outLen, inBuff);
 372     if ( rc != wxCONV_FAILED )
 373     {
 374         rc -= GetMBNulLen();
 375     }
 376
 377     return rc;
 378 }
 379
 380 wxMBConv::~wxMBConv()
 381 {
 382     // nothing to do here (necessary for Darwin linking probably)
 383 }
 384
 385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 386 {
 387     if ( psz )
 388     {
 389         // calculate the length of the buffer needed first
 390         const size_t nLen = ToWChar(NULL, 0, psz);
 391         if ( nLen != wxCONV_FAILED )
 392         {
 393             // now do the actual conversion
 394             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 395
 396             // +1 for the trailing NULL
 397             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 398                 return buf;
 399         }
 400     }
 401
 402     return wxWCharBuffer();
 403 }
 404
 405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 406 {
 407     if ( pwz )
 408     {
 409         const size_t nLen = FromWChar(NULL, 0, pwz);
 410         if ( nLen != wxCONV_FAILED )
 411         {
 412             wxCharBuffer buf(nLen - 1);
 413             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 414                 return buf;
 415         }
 416     }
 417
 418     return wxCharBuffer();
 419 }
 420
 421 const wxWCharBuffer
 422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 423 {
 424     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 425     if ( dstLen != wxCONV_FAILED )
 426     {
 427         // notice that we allocate space for dstLen+1 wide characters here
 428         // because we want the buffer to always be NUL-terminated, even if the
 429         // input isn't (as otherwise the caller has no way to know its length)
 430         wxWCharBuffer wbuf(dstLen);
 431         wbuf.data()[dstLen] = L'\0';
 432         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 433         {
 434             if ( outLen )
 435             {
 436                 *outLen = dstLen;
 437
 438                 // we also need to handle NUL-terminated input strings
 439                 // specially: for them the output is the length of the string
 440                 // excluding the trailing NUL, however if we're asked to
 441                 // convert a specific number of characters we return the length
 442                 // of the resulting output even if it's NUL-terminated
 443                 if ( inLen == wxNO_LEN )
 444                     (*outLen)--;
 445             }
 446
 447             return wbuf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxWCharBuffer();
 455 }
 456
 457 const wxCharBuffer
 458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 459 {
 460     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 461     if ( dstLen != wxCONV_FAILED )
 462     {
 463         const size_t nulLen = GetMBNulLen();
 464
 465         // as above, ensure that the buffer is always NUL-terminated, even if
 466         // the input is not
 467         wxCharBuffer buf(dstLen + nulLen - 1);
 468         memset(buf.data() + dstLen, 0, nulLen);
 469         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 470         {
 471             if ( outLen )
 472             {
 473                 *outLen = dstLen;
 474
 475                 if ( inLen == wxNO_LEN )
 476                 {
 477                     // in this case both input and output are NUL-terminated
 478                     // and we're not supposed to count NUL
 479                     *outLen -= nulLen;
 480                 }
 481             }
 482
 483             return buf;
 484         }
 485     }
 486
 487     if ( outLen )
 488         *outLen = 0;
 489
 490     return wxCharBuffer();
 491 }
 492
 493 // ----------------------------------------------------------------------------
 494 // wxMBConvLibc
 495 // ----------------------------------------------------------------------------
 496
 497 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 498 {
 499     return wxMB2WC(buf, psz, n);
 500 }
 501
 502 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 503 {
 504     return wxWC2MB(buf, psz, n);
 505 }
 506
 507 // ----------------------------------------------------------------------------
 508 // wxConvBrokenFileNames
 509 // ----------------------------------------------------------------------------
 510
 511 #ifdef __UNIX__
 512
 513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 514 {
 515     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 516          wxStricmp(charset, _T("UTF8")) == 0  )
 517         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 518     else
 519         m_conv = new wxCSConv(charset);
 520 }
 521
 522 #endif // __UNIX__
 523
 524 // ----------------------------------------------------------------------------
 525 // UTF-7
 526 // ----------------------------------------------------------------------------
 527
 528 // Implementation (C) 2004 Fredrik Roubert
 529 //
 530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 531
 532 //
 533 // BASE64 decoding table
 534 //
 535 static const unsigned char utf7unb64[] =
 536 {
 537     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 538     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 539     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 540     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 541     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 542     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 543     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 544     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 545     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 546     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 547     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 548     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 549     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 550     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 551     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 552     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 553     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 554     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 555     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 556     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 557     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 558     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 559     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 560     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 561     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 562     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 563     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 564     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 565     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 566     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 567     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 568     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 569 };
 570
 571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 572                              const char *src, size_t srcLen) const
 573 {
 574     DecoderState stateOrig,
 575                 *statePtr;
 576     if ( srcLen == wxNO_LEN )
 577     {
 578         // convert the entire string, up to and including the trailing NUL
 579         srcLen = strlen(src) + 1;
 580
 581         // when working on the entire strings we don't update nor use the shift
 582         // state from the previous call
 583         statePtr = &stateOrig;
 584     }
 585     else // when working with partial strings we do use the shift state
 586     {
 587         statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
 588
 589         // also save the old state to be able to rollback to it on error
 590         stateOrig = m_stateDecoder;
 591     }
 592
 593     // but to simplify the code below we use this variable in both cases
 594     DecoderState& state = *statePtr;
 595
 596
 597     // number of characters [which would have been] written to dst [if it were
 598     // not NULL]
 599     size_t len = 0;
 600
 601     const char * const srcEnd = src + srcLen;
 602
 603     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 604     {
 605         const unsigned char cc = *src++;
 606
 607         if ( state.IsShifted() )
 608         {
 609             const unsigned char dc = utf7unb64[cc];
 610             if ( dc == 0xff )
 611             {
 612                 // end of encoded part, check that nothing was left: the bit
 613                 // field cycles through 0,6,4,2 sequence so check that we're at
 614                 // the end of it
 615                 if ( state.bit != 2 )
 616                     return wxCONV_FAILED;
 617
 618                 state.ToDirect();
 619
 620                 // re-parse this character normally below unless it's '-' which
 621                 // is consumed by the decoder
 622                 if ( cc == '-' )
 623                     continue;
 624             }
 625             else // valid encoded character
 626             {
 627                 // mini base64 decoder: each character is 6 bits
 628                 state.bit += 6;
 629                 state.accum <<= 6;
 630                 state.accum += dc;
 631
 632                 if ( state.bit >= 8 )
 633                 {
 634                     // got the full byte, consume it
 635                     state.bit -= 8;
 636                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 637
 638                     if ( state.isLSB )
 639                     {
 640                         // we've got the full word, output it
 641                         if ( dst )
 642                             *dst++ = (state.msb << 8) | b;
 643                         len++;
 644                         state.isLSB = false;
 645                     }
 646                     else // MSB
 647                     {
 648                         // just store it while we wait for LSB
 649                         state.msb = b;
 650                         state.isLSB = true;
 651                     }
 652                 }
 653             }
 654         }
 655
 656         if ( state.IsDirect() )
 657         {
 658             // start of an encoded segment?
 659             if ( cc == '+' )
 660             {
 661                 if ( *src == '-' )
 662                 {
 663                     // just the encoded plus sign, don't switch to shifted mode
 664                     if ( dst )
 665                         *dst++ = '+';
 666                     len++;
 667                     src++;
 668                 }
 669                 else
 670                 {
 671                     state.ToShifted();
 672                 }
 673             }
 674             else // not '+'
 675             {
 676                 // only printable 7 bit ASCII characters (with the exception of
 677                 // NUL, TAB, CR and LF) can be used directly
 678                 if ( cc >= 0x7f || (cc < ' ' &&
 679                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 680                     return wxCONV_FAILED;
 681
 682                 if ( dst )
 683                     *dst++ = cc;
 684                 len++;
 685             }
 686         }
 687     }
 688
 689     if ( !len )
 690     {
 691         // as we didn't read any characters we should be called with the same
 692         // data (followed by some more new data) again later so don't save our
 693         // state
 694         state = stateOrig;
 695
 696         return wxCONV_FAILED;
 697     }
 698
 699     return len;
 700 }
 701
 702 //
 703 // BASE64 encoding table
 704 //
 705 static const unsigned char utf7enb64[] =
 706 {
 707     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 708     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 709     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 710     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 711     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 712     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 713     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 714     '4', '5', '6', '7', '8', '9', '+', '/'
 715 };
 716
 717 //
 718 // UTF-7 encoding table
 719 //
 720 // 0 - Set D (directly encoded characters)
 721 // 1 - Set O (optional direct characters)
 722 // 2 - whitespace characters (optional)
 723 // 3 - special characters
 724 //
 725 static const unsigned char utf7encode[128] =
 726 {
 727     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 728     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 729     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 730     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 731     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 732     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 733     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 734     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 735 };
 736
 737 static inline bool wxIsUTF7Direct(wchar_t wc)
 738 {
 739     return wc < 0x80 && utf7encode[wc] < 1;
 740 }
 741
 742 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 743                                const wchar_t *src, size_t srcLen) const
 744 {
 745     EncoderState stateOrig,
 746                 *statePtr;
 747     if ( srcLen == wxNO_LEN )
 748     {
 749         // we don't apply the stored state when operating on entire strings at
 750         // once
 751         statePtr = &stateOrig;
 752
 753         srcLen = wxWcslen(src) + 1;
 754     }
 755     else // do use the mode we left the output in previously
 756     {
 757         stateOrig = m_stateEncoder;
 758         statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
 759     }
 760
 761     EncoderState& state = *statePtr;
 762
 763
 764     size_t len = 0;
 765
 766     const wchar_t * const srcEnd = src + srcLen;
 767     while ( src < srcEnd && (!dst || len < dstLen) )
 768     {
 769         wchar_t cc = *src++;
 770         if ( wxIsUTF7Direct(cc) )
 771         {
 772             if ( state.IsShifted() )
 773             {
 774                 // pad with zeros the last encoded block if necessary
 775                 if ( state.bit )
 776                 {
 777                     if ( dst )
 778                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 779                     len++;
 780                 }
 781
 782                 state.ToDirect();
 783
 784                 if ( dst )
 785                     *dst++ = '-';
 786                 len++;
 787             }
 788
 789             if ( dst )
 790                 *dst++ = (char)cc;
 791             len++;
 792         }
 793         else if ( cc == '+' && state.IsDirect() )
 794         {
 795             if ( dst )
 796             {
 797                 *dst++ = '+';
 798                 *dst++ = '-';
 799             }
 800
 801             len += 2;
 802         }
 803 #ifndef WC_UTF16
 804         else if (((wxUint32)cc) > 0xffff)
 805         {
 806             // no surrogate pair generation (yet?)
 807             return wxCONV_FAILED;
 808         }
 809 #endif
 810         else
 811         {
 812             if ( state.IsDirect() )
 813             {
 814                 state.ToShifted();
 815
 816                 if ( dst )
 817                     *dst++ = '+';
 818                 len++;
 819             }
 820
 821             // BASE64 encode string
 822             for ( ;; )
 823             {
 824                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 825                 {
 826                     state.accum <<= 8;
 827                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 828
 829                     for (state.bit += 8; state.bit >= 6; )
 830                     {
 831                         state.bit -= 6;
 832                         if ( dst )
 833                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 834                         len++;
 835                     }
 836                 }
 837
 838                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 839                     break;
 840
 841                 src++;
 842             }
 843         }
 844     }
 845
 846     // we need to restore the original encoder state if we were called just to
 847     // calculate the amount of space needed as we will presumably be called
 848     // again to really convert the data now
 849     if ( !dst )
 850         state = stateOrig;
 851
 852     return len;
 853 }
 854
 855 // ----------------------------------------------------------------------------
 856 // UTF-8
 857 // ----------------------------------------------------------------------------
 858
 859 static const wxUint32 utf8_max[]=
 860     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 861
 862 // boundaries of the private use area we use to (temporarily) remap invalid
 863 // characters invalid in a UTF-8 encoded string
 864 const wxUint32 wxUnicodePUA = 0x100000;
 865 const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 866
 867 // this table gives the length of the UTF-8 encoding from its first character:
 868 const unsigned char tableUtf8Lengths[256] = {
 869     // single-byte sequences (ASCII):
 870     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
 871     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
 872     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
 873     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
 874     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
 875     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
 876     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
 877     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F
 878
 879     // these are invalid:
 880     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
 881     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
 882     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
 883     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
 884     0, 0,                                            // C0,C1
 885
 886     // two-byte sequences:
 887           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
 888     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF
 889
 890     // three-byte sequences:
 891     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF
 892
 893     // four-byte sequences:
 894     4, 4, 4, 4, 4,                                   // F0..F4
 895
 896     // these are invalid again (5- or 6-byte
 897     // sequences and sequences for code points
 898     // above U+10FFFF, as restricted by RFC 3629):
 899                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
 900 };
 901
 902 size_t
 903 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 904                             const char *src, size_t srcLen) const
 905 {
 906     wchar_t *out = dstLen ? dst : NULL;
 907     size_t written = 0;
 908
 909     if ( srcLen == wxNO_LEN )
 910         srcLen = strlen(src) + 1;
 911
 912     for ( const char *p = src; ; p++ )
 913     {
 914         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 915         {
 916             // all done successfully, just add the trailing NULL if we are not
 917             // using explicit length
 918             if ( srcLen == wxNO_LEN )
 919             {
 920                 if ( out )
 921                 {
 922                     if ( !dstLen )
 923                         break;
 924
 925                     *out = L'\0';
 926                 }
 927
 928                 written++;
 929             }
 930
 931             return written;
 932         }
 933
 934         if ( out && !dstLen-- )
 935             break;
 936
 937         wxUint32 code;
 938         unsigned char c = *p;
 939
 940         if ( c < 0x80 )
 941         {
 942             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 943                 break;
 944
 945             if ( srcLen != wxNO_LEN )
 946                 srcLen--;
 947
 948             code = c;
 949         }
 950         else
 951         {
 952             unsigned len = tableUtf8Lengths[c];
 953             if ( !len )
 954                 break;
 955
 956             if ( srcLen < len ) // the test works for wxNO_LEN too
 957                 break;
 958
 959             if ( srcLen != wxNO_LEN )
 960                 srcLen -= len;
 961
 962             //   Char. number range   |        UTF-8 octet sequence
 963             //      (hexadecimal)     |              (binary)
 964             //  ----------------------+----------------------------------------
 965             //  0000 0000 - 0000 007F | 0xxxxxxx
 966             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 967             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 968             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 969             //
 970             //  Code point value is stored in bits marked with 'x',
 971             //  lowest-order bit of the value on the right side in the diagram
 972             //  above.                                         (from RFC 3629)
 973
 974             // mask to extract lead byte's value ('x' bits above), by sequence
 975             // length:
 976             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 977
 978             // mask and value of lead byte's most significant bits, by length:
 979             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 980             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 981
 982             len--; // it's more convenient to work with 0-based length here
 983
 984             // extract the lead byte's value bits:
 985             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
 986                 break;
 987
 988             code = c & leadValueMask[len];
 989
 990             // all remaining bytes, if any, are handled in the same way
 991             // regardless of sequence's length:
 992             for ( ; len; --len )
 993             {
 994                 c = *++p;
 995                 if ( (c & 0xC0) != 0x80 )
 996                     return wxCONV_FAILED;
 997
 998                 code <<= 6;
 999                 code |= c & 0x3F;
1000             }
1001         }
1002
1003 #ifdef WC_UTF16
1004         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1005         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1006         {
1007             if ( out )
1008                 out++;
1009             written++;
1010         }
1011 #else // !WC_UTF16
1012         if ( out )
1013             *out = code;
1014 #endif // WC_UTF16/!WC_UTF16
1015
1016         if ( out )
1017             out++;
1018
1019         written++;
1020     }
1021
1022     return wxCONV_FAILED;
1023 }
1024
1025 size_t
1026 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1027                               const wchar_t *src, size_t srcLen) const
1028 {
1029     char *out = dstLen ? dst : NULL;
1030     size_t written = 0;
1031
1032     for ( const wchar_t *wp = src; ; wp++ )
1033     {
1034         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1035         {
1036             // all done successfully, just add the trailing NULL if we are not
1037             // using explicit length
1038             if ( srcLen == wxNO_LEN )
1039             {
1040                 if ( out )
1041                 {
1042                     if ( !dstLen )
1043                         break;
1044
1045                     *out = '\0';
1046                 }
1047
1048                 written++;
1049             }
1050
1051             return written;
1052         }
1053
1054         if ( srcLen != wxNO_LEN )
1055             srcLen--;
1056
1057         wxUint32 code;
1058 #ifdef WC_UTF16
1059         // cast is ok for WC_UTF16
1060         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1061         {
1062             // skip the next char too as we decoded a surrogate
1063             wp++;
1064         }
1065 #else // wchar_t is UTF-32
1066         code = *wp & 0x7fffffff;
1067 #endif
1068
1069         unsigned len;
1070         if ( code <= 0x7F )
1071         {
1072             len = 1;
1073             if ( out )
1074             {
1075                 if ( dstLen < len )
1076                     break;
1077
1078                 out[0] = (char)code;
1079             }
1080         }
1081         else if ( code <= 0x07FF )
1082         {
1083             len = 2;
1084             if ( out )
1085             {
1086                 if ( dstLen < len )
1087                     break;
1088
1089                 // NB: this line takes 6 least significant bits, encodes them as
1090                 // 10xxxxxx and discards them so that the next byte can be encoded:
1091                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1092                 out[0] = 0xC0 | code;
1093             }
1094         }
1095         else if ( code < 0xFFFF )
1096         {
1097             len = 3;
1098             if ( out )
1099             {
1100                 if ( dstLen < len )
1101                     break;
1102
1103                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1104                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1105                 out[0] = 0xE0 | code;
1106             }
1107         }
1108         else if ( code <= 0x10FFFF )
1109         {
1110             len = 4;
1111             if ( out )
1112             {
1113                 if ( dstLen < len )
1114                     break;
1115
1116                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1117                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1118                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1119                 out[0] = 0xF0 | code;
1120             }
1121         }
1122         else
1123         {
1124             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1125             break;
1126         }
1127
1128         if ( out )
1129         {
1130             out += len;
1131             dstLen -= len;
1132         }
1133
1134         written += len;
1135     }
1136
1137     // we only get here if an error occurs during decoding
1138     return wxCONV_FAILED;
1139 }
1140
1141 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1142                              const char *psz, size_t srcLen) const
1143 {
1144     if ( m_options == MAP_INVALID_UTF8_NOT )
1145         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1146
1147     size_t len = 0;
1148
1149     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1150     {
1151         const char *opsz = psz;
1152         bool invalid = false;
1153         unsigned char cc = *psz++, fc = cc;
1154         unsigned cnt;
1155         for (cnt = 0; fc & 0x80; cnt++)
1156             fc <<= 1;
1157
1158         if (!cnt)
1159         {
1160             // plain ASCII char
1161             if (buf)
1162                 *buf++ = cc;
1163             len++;
1164
1165             // escape the escape character for octal escapes
1166             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1167                     && cc == '\\' && (!buf || len < n))
1168             {
1169                 if (buf)
1170                     *buf++ = cc;
1171                 len++;
1172             }
1173         }
1174         else
1175         {
1176             cnt--;
1177             if (!cnt)
1178             {
1179                 // invalid UTF-8 sequence
1180                 invalid = true;
1181             }
1182             else
1183             {
1184                 unsigned ocnt = cnt - 1;
1185                 wxUint32 res = cc & (0x3f >> cnt);
1186                 while (cnt--)
1187                 {
1188                     cc = *psz;
1189                     if ((cc & 0xC0) != 0x80)
1190                     {
1191                         // invalid UTF-8 sequence
1192                         invalid = true;
1193                         break;
1194                     }
1195
1196                     psz++;
1197                     res = (res << 6) | (cc & 0x3f);
1198                 }
1199
1200                 if (invalid || res <= utf8_max[ocnt])
1201                 {
1202                     // illegal UTF-8 encoding
1203                     invalid = true;
1204                 }
1205                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1206                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1207                 {
1208                     // if one of our PUA characters turns up externally
1209                     // it must also be treated as an illegal sequence
1210                     // (a bit like you have to escape an escape character)
1211                     invalid = true;
1212                 }
1213                 else
1214                 {
1215 #ifdef WC_UTF16
1216                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1217                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1218                     if (pa == wxCONV_FAILED)
1219                     {
1220                         invalid = true;
1221                     }
1222                     else
1223                     {
1224                         if (buf)
1225                             buf += pa;
1226                         len += pa;
1227                     }
1228 #else // !WC_UTF16
1229                     if (buf)
1230                         *buf++ = (wchar_t)res;
1231                     len++;
1232 #endif // WC_UTF16/!WC_UTF16
1233                 }
1234             }
1235
1236             if (invalid)
1237             {
1238                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1239                 {
1240                     while (opsz < psz && (!buf || len < n))
1241                     {
1242 #ifdef WC_UTF16
1243                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1244                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1245                         wxASSERT(pa != wxCONV_FAILED);
1246                         if (buf)
1247                             buf += pa;
1248                         opsz++;
1249                         len += pa;
1250 #else
1251                         if (buf)
1252                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1253                         opsz++;
1254                         len++;
1255 #endif
1256                     }
1257                 }
1258                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1259                 {
1260                     while (opsz < psz && (!buf || len < n))
1261                     {
1262                         if ( buf && len + 3 < n )
1263                         {
1264                             unsigned char on = *opsz;
1265                             *buf++ = L'\\';
1266                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1267                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1268                             *buf++ = (wchar_t)( L'0' + on % 010 );
1269                         }
1270
1271                         opsz++;
1272                         len += 4;
1273                     }
1274                 }
1275                 else // MAP_INVALID_UTF8_NOT
1276                 {
1277                     return wxCONV_FAILED;
1278                 }
1279             }
1280         }
1281     }
1282
1283     if (srcLen == wxNO_LEN && buf && (len < n))
1284         *buf = 0;
1285
1286     return len + 1;
1287 }
1288
// Return true if wch is one of the octal digits, i.e. L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1293
1294 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1295                                const wchar_t *psz, size_t srcLen) const
1296 {
1297     if ( m_options == MAP_INVALID_UTF8_NOT )
1298         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1299
1300     size_t len = 0;
1301
1302     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1303     {
1304         wxUint32 cc;
1305
1306 #ifdef WC_UTF16
1307         // cast is ok for WC_UTF16
1308         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1309         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1310 #else
1311         cc = (*psz++) & 0x7fffffff;
1312 #endif
1313
1314         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1315                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1316         {
1317             if (buf)
1318                 *buf++ = (char)(cc - wxUnicodePUA);
1319             len++;
1320         }
1321         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1322                     && cc == L'\\' && psz[0] == L'\\' )
1323         {
1324             if (buf)
1325                 *buf++ = (char)cc;
1326             psz++;
1327             len++;
1328         }
1329         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1330                     cc == L'\\' &&
1331                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1332         {
1333             if (buf)
1334             {
1335                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1336                                  (psz[1] - L'0') * 010 +
1337                                  (psz[2] - L'0'));
1338             }
1339
1340             psz += 3;
1341             len++;
1342         }
1343         else
1344         {
1345             unsigned cnt;
1346             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1347             {
1348             }
1349
1350             if (!cnt)
1351             {
1352                 // plain ASCII char
1353                 if (buf)
1354                     *buf++ = (char) cc;
1355                 len++;
1356             }
1357             else
1358             {
1359                 len += cnt + 1;
1360                 if (buf)
1361                 {
1362                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1363                     while (cnt--)
1364                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1365                 }
1366             }
1367         }
1368     }
1369
1370     if (srcLen == wxNO_LEN && buf && (len < n))
1371         *buf = 0;
1372
1373     return len + 1;
1374 }
1375
1376 // ============================================================================
1377 // UTF-16
1378 // ============================================================================
1379
// "straight" is the converter which doesn't need to change the byte order of
// its input on this machine and "swap" is the one which has to reverse it
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1387
1388 /* static */
1389 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1390 {
1391     if ( srcLen == wxNO_LEN )
1392     {
1393         // count the number of bytes in input, including the trailing NULs
1394         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1395         for ( srcLen = 1; *inBuff++; srcLen++ )
1396             ;
1397
1398         srcLen *= BYTES_PER_CHAR;
1399     }
1400     else // we already have the length
1401     {
1402         // we can only convert an entire number of UTF-16 characters
1403         if ( srcLen % BYTES_PER_CHAR )
1404             return wxCONV_FAILED;
1405     }
1406
1407     return srcLen;
1408 }
1409
1410 // case when in-memory representation is UTF-16 too
1411 #ifdef WC_UTF16
1412
1413 // ----------------------------------------------------------------------------
1414 // conversions without endianness change
1415 // ----------------------------------------------------------------------------
1416
1417 size_t
1418 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1419                                const char *src, size_t srcLen) const
1420 {
1421     // set up the scene for using memcpy() (which is presumably more efficient
1422     // than copying the bytes one by one)
1423     srcLen = GetLength(src, srcLen);
1424     if ( srcLen == wxNO_LEN )
1425         return wxCONV_FAILED;
1426
1427     const size_t inLen = srcLen / BYTES_PER_CHAR;
1428     if ( dst )
1429     {
1430         if ( dstLen < inLen )
1431             return wxCONV_FAILED;
1432
1433         memcpy(dst, src, srcLen);
1434     }
1435
1436     return inLen;
1437 }
1438
1439 size_t
1440 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1441                                  const wchar_t *src, size_t srcLen) const
1442 {
1443     if ( srcLen == wxNO_LEN )
1444         srcLen = wxWcslen(src) + 1;
1445
1446     srcLen *= BYTES_PER_CHAR;
1447
1448     if ( dst )
1449     {
1450         if ( dstLen < srcLen )
1451             return wxCONV_FAILED;
1452
1453         memcpy(dst, src, srcLen);
1454     }
1455
1456     return srcLen;
1457 }
1458
1459 // ----------------------------------------------------------------------------
1460 // endian-reversing conversions
1461 // ----------------------------------------------------------------------------
1462
1463 size_t
1464 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1465                            const char *src, size_t srcLen) const
1466 {
1467     srcLen = GetLength(src, srcLen);
1468     if ( srcLen == wxNO_LEN )
1469         return wxCONV_FAILED;
1470
1471     srcLen /= BYTES_PER_CHAR;
1472
1473     if ( dst )
1474     {
1475         if ( dstLen < srcLen )
1476             return wxCONV_FAILED;
1477
1478         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1479         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1480         {
1481             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1482         }
1483     }
1484
1485     return srcLen;
1486 }
1487
1488 size_t
1489 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1490                              const wchar_t *src, size_t srcLen) const
1491 {
1492     if ( srcLen == wxNO_LEN )
1493         srcLen = wxWcslen(src) + 1;
1494
1495     srcLen *= BYTES_PER_CHAR;
1496
1497     if ( dst )
1498     {
1499         if ( dstLen < srcLen )
1500             return wxCONV_FAILED;
1501
1502         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1503         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1504         {
1505             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1506         }
1507     }
1508
1509     return srcLen;
1510 }
1511
1512 #else // !WC_UTF16: wchar_t is UTF-32
1513
1514 // ----------------------------------------------------------------------------
1515 // conversions without endianness change
1516 // ----------------------------------------------------------------------------
1517
1518 size_t
1519 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1520                                const char *src, size_t srcLen) const
1521 {
1522     srcLen = GetLength(src, srcLen);
1523     if ( srcLen == wxNO_LEN )
1524         return wxCONV_FAILED;
1525
1526     const size_t inLen = srcLen / BYTES_PER_CHAR;
1527     if ( !dst )
1528     {
1529         // optimization: return maximal space which could be needed for this
1530         // string even if the real size could be smaller if the buffer contains
1531         // any surrogates
1532         return inLen;
1533     }
1534
1535     size_t outLen = 0;
1536     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1537     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1538     {
1539         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1540         if ( !inBuff )
1541             return wxCONV_FAILED;
1542
1543         if ( ++outLen > dstLen )
1544             return wxCONV_FAILED;
1545
1546         *dst++ = ch;
1547     }
1548
1549
1550     return outLen;
1551 }
1552
1553 size_t
1554 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1555                                  const wchar_t *src, size_t srcLen) const
1556 {
1557     if ( srcLen == wxNO_LEN )
1558         srcLen = wxWcslen(src) + 1;
1559
1560     size_t outLen = 0;
1561     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1562     for ( size_t n = 0; n < srcLen; n++ )
1563     {
1564         wxUint16 cc[2];
1565         const size_t numChars = encode_utf16(*src++, cc);
1566         if ( numChars == wxCONV_FAILED )
1567             return wxCONV_FAILED;
1568
1569         outLen += numChars * BYTES_PER_CHAR;
1570         if ( outBuff )
1571         {
1572             if ( outLen > dstLen )
1573                 return wxCONV_FAILED;
1574
1575             *outBuff++ = cc[0];
1576             if ( numChars == 2 )
1577             {
1578                 // second character of a surrogate
1579                 *outBuff++ = cc[1];
1580             }
1581         }
1582     }
1583
1584     return outLen;
1585 }
1586
1587 // ----------------------------------------------------------------------------
1588 // endian-reversing conversions
1589 // ----------------------------------------------------------------------------
1590
1591 size_t
1592 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1593                            const char *src, size_t srcLen) const
1594 {
1595     srcLen = GetLength(src, srcLen);
1596     if ( srcLen == wxNO_LEN )
1597         return wxCONV_FAILED;
1598
1599     const size_t inLen = srcLen / BYTES_PER_CHAR;
1600     if ( !dst )
1601     {
1602         // optimization: return maximal space which could be needed for this
1603         // string even if the real size could be smaller if the buffer contains
1604         // any surrogates
1605         return inLen;
1606     }
1607
1608     size_t outLen = 0;
1609     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1610     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1611     {
1612         wxUint32 ch;
1613         wxUint16 tmp[2];
1614
1615         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1616         inBuff++;
1617         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1618
1619         const size_t numChars = decode_utf16(tmp, ch);
1620         if ( numChars == wxCONV_FAILED )
1621             return wxCONV_FAILED;
1622
1623         if ( numChars == 2 )
1624             inBuff++;
1625
1626         if ( ++outLen > dstLen )
1627             return wxCONV_FAILED;
1628
1629         *dst++ = ch;
1630     }
1631
1632
1633     return outLen;
1634 }
1635
1636 size_t
1637 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1638                              const wchar_t *src, size_t srcLen) const
1639 {
1640     if ( srcLen == wxNO_LEN )
1641         srcLen = wxWcslen(src) + 1;
1642
1643     size_t outLen = 0;
1644     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1645     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1646     {
1647         wxUint16 cc[2];
1648         const size_t numChars = encode_utf16(*src, cc);
1649         if ( numChars == wxCONV_FAILED )
1650             return wxCONV_FAILED;
1651
1652         outLen += numChars * BYTES_PER_CHAR;
1653         if ( outBuff )
1654         {
1655             if ( outLen > dstLen )
1656                 return wxCONV_FAILED;
1657
1658             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1659             if ( numChars == 2 )
1660             {
1661                 // second character of a surrogate
1662                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1663             }
1664         }
1665     }
1666
1667     return outLen;
1668 }
1669
1670 #endif // WC_UTF16/!WC_UTF16
1671
1672
1673 // ============================================================================
1674 // UTF-32
1675 // ============================================================================
1676
// "straight" is the converter which doesn't need to change the byte order of
// its input on this machine and "swap" is the one which has to reverse it
#ifndef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#else // WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#endif // !WORDS_BIGENDIAN/WORDS_BIGENDIAN
1684
1685
// Global objects of the two UTF-32 converter classes, one per byte order.
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined elsewhere, presumably it adds
// the DLL import/export decoration to the declaration -- confirm.
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1688
1689 /* static */
1690 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1691 {
1692     if ( srcLen == wxNO_LEN )
1693     {
1694         // count the number of bytes in input, including the trailing NULs
1695         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1696         for ( srcLen = 1; *inBuff++; srcLen++ )
1697             ;
1698
1699         srcLen *= BYTES_PER_CHAR;
1700     }
1701     else // we already have the length
1702     {
1703         // we can only convert an entire number of UTF-32 characters
1704         if ( srcLen % BYTES_PER_CHAR )
1705             return wxCONV_FAILED;
1706     }
1707
1708     return srcLen;
1709 }
1710
1711 // case when in-memory representation is UTF-16
1712 #ifdef WC_UTF16
1713
1714 // ----------------------------------------------------------------------------
1715 // conversions without endianness change
1716 // ----------------------------------------------------------------------------
1717
1718 size_t
1719 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1720                                const char *src, size_t srcLen) const
1721 {
1722     srcLen = GetLength(src, srcLen);
1723     if ( srcLen == wxNO_LEN )
1724         return wxCONV_FAILED;
1725
1726     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1727     const size_t inLen = srcLen / BYTES_PER_CHAR;
1728     size_t outLen = 0;
1729     for ( size_t n = 0; n < inLen; n++ )
1730     {
1731         wxUint16 cc[2];
1732         const size_t numChars = encode_utf16(*inBuff++, cc);
1733         if ( numChars == wxCONV_FAILED )
1734             return wxCONV_FAILED;
1735
1736         outLen += numChars;
1737         if ( dst )
1738         {
1739             if ( outLen > dstLen )
1740                 return wxCONV_FAILED;
1741
1742             *dst++ = cc[0];
1743             if ( numChars == 2 )
1744             {
1745                 // second character of a surrogate
1746                 *dst++ = cc[1];
1747             }
1748         }
1749     }
1750
1751     return outLen;
1752 }
1753
1754 size_t
1755 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1756                                  const wchar_t *src, size_t srcLen) const
1757 {
1758     if ( srcLen == wxNO_LEN )
1759         srcLen = wxWcslen(src) + 1;
1760
1761     if ( !dst )
1762     {
1763         // optimization: return maximal space which could be needed for this
1764         // string instead of the exact amount which could be less if there are
1765         // any surrogates in the input
1766         //
1767         // we consider that surrogates are rare enough to make it worthwhile to
1768         // avoid running the loop below at the cost of slightly extra memory
1769         // consumption
1770         return srcLen * BYTES_PER_CHAR;
1771     }
1772
1773     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1774     size_t outLen = 0;
1775     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1776     {
1777         const wxUint32 ch = wxDecodeSurrogate(&src);
1778         if ( !src )
1779             return wxCONV_FAILED;
1780
1781         outLen += BYTES_PER_CHAR;
1782
1783         if ( outLen > dstLen )
1784             return wxCONV_FAILED;
1785
1786         *outBuff++ = ch;
1787     }
1788
1789     return outLen;
1790 }
1791
1792 // ----------------------------------------------------------------------------
1793 // endian-reversing conversions
1794 // ----------------------------------------------------------------------------
1795
1796 size_t
1797 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1798                            const char *src, size_t srcLen) const
1799 {
1800     srcLen = GetLength(src, srcLen);
1801     if ( srcLen == wxNO_LEN )
1802         return wxCONV_FAILED;
1803
1804     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1805     const size_t inLen = srcLen / BYTES_PER_CHAR;
1806     size_t outLen = 0;
1807     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1808     {
1809         wxUint16 cc[2];
1810         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1811         if ( numChars == wxCONV_FAILED )
1812             return wxCONV_FAILED;
1813
1814         outLen += numChars;
1815         if ( dst )
1816         {
1817             if ( outLen > dstLen )
1818                 return wxCONV_FAILED;
1819
1820             *dst++ = cc[0];
1821             if ( numChars == 2 )
1822             {
1823                 // second character of a surrogate
1824                 *dst++ = cc[1];
1825             }
1826         }
1827     }
1828
1829     return outLen;
1830 }
1831
1832 size_t
1833 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1834                              const wchar_t *src, size_t srcLen) const
1835 {
1836     if ( srcLen == wxNO_LEN )
1837         srcLen = wxWcslen(src) + 1;
1838
1839     if ( !dst )
1840     {
1841         // optimization: return maximal space which could be needed for this
1842         // string instead of the exact amount which could be less if there are
1843         // any surrogates in the input
1844         //
1845         // we consider that surrogates are rare enough to make it worthwhile to
1846         // avoid running the loop below at the cost of slightly extra memory
1847         // consumption
1848         return srcLen*BYTES_PER_CHAR;
1849     }
1850
1851     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1852     size_t outLen = 0;
1853     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1854     {
1855         const wxUint32 ch = wxDecodeSurrogate(&src);
1856         if ( !src )
1857             return wxCONV_FAILED;
1858
1859         outLen += BYTES_PER_CHAR;
1860
1861         if ( outLen > dstLen )
1862             return wxCONV_FAILED;
1863
1864         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1865     }
1866
1867     return outLen;
1868 }
1869
1870 #else // !WC_UTF16: wchar_t is UTF-32
1871
1872 // ----------------------------------------------------------------------------
1873 // conversions without endianness change
1874 // ----------------------------------------------------------------------------
1875
1876 size_t
1877 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1878                                const char *src, size_t srcLen) const
1879 {
1880     // use memcpy() as it should be much faster than hand-written loop
1881     srcLen = GetLength(src, srcLen);
1882     if ( srcLen == wxNO_LEN )
1883         return wxCONV_FAILED;
1884
1885     const size_t inLen = srcLen/BYTES_PER_CHAR;
1886     if ( dst )
1887     {
1888         if ( dstLen < inLen )
1889             return wxCONV_FAILED;
1890
1891         memcpy(dst, src, srcLen);
1892     }
1893
1894     return inLen;
1895 }
1896
1897 size_t
1898 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1899                                  const wchar_t *src, size_t srcLen) const
1900 {
1901     if ( srcLen == wxNO_LEN )
1902         srcLen = wxWcslen(src) + 1;
1903
1904     srcLen *= BYTES_PER_CHAR;
1905
1906     if ( dst )
1907     {
1908         if ( dstLen < srcLen )
1909             return wxCONV_FAILED;
1910
1911         memcpy(dst, src, srcLen);
1912     }
1913
1914     return srcLen;
1915 }
1916
1917 // ----------------------------------------------------------------------------
1918 // endian-reversing conversions
1919 // ----------------------------------------------------------------------------
1920
1921 size_t
1922 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1923                            const char *src, size_t srcLen) const
1924 {
1925     srcLen = GetLength(src, srcLen);
1926     if ( srcLen == wxNO_LEN )
1927         return wxCONV_FAILED;
1928
1929     srcLen /= BYTES_PER_CHAR;
1930
1931     if ( dst )
1932     {
1933         if ( dstLen < srcLen )
1934             return wxCONV_FAILED;
1935
1936         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1937         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1938         {
1939             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1940         }
1941     }
1942
1943     return srcLen;
1944 }
1945
1946 size_t
1947 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1948                              const wchar_t *src, size_t srcLen) const
1949 {
1950     if ( srcLen == wxNO_LEN )
1951         srcLen = wxWcslen(src) + 1;
1952
1953     srcLen *= BYTES_PER_CHAR;
1954
1955     if ( dst )
1956     {
1957         if ( dstLen < srcLen )
1958             return wxCONV_FAILED;
1959
1960         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1961         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1962         {
1963             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1964         }
1965     }
1966
1967     return srcLen;
1968 }
1969
1970 #endif // WC_UTF16/!WC_UTF16
1971
1972
1973 // ============================================================================
1974 // The classes doing conversion using the iconv_xxx() functions
1975 // ============================================================================
1976
1977 #ifdef HAVE_ICONV
1978
1979 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1980 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1981 //     (unless there's yet another bug in glibc) the only case when iconv()
1982 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1983 //     left in the input buffer -- when _real_ error occurs,
1984 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1985 //     iconv() failure.
1986 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes of input unconsumed should be treated as failed.
//
// The macro parameters are parenthesised so that arbitrary expressions can
// be passed safely as arguments.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of the input pointer to the type expected by iconv(),
// whose second parameter is "const char **" on some systems only
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used to mark a conversion descriptor as not opened
#define ICONV_T_INVALID ((iconv_t)-1)
1997
1998 #if SIZEOF_WCHAR_T == 4
1999     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2000     #define WC_ENC      wxFONTENCODING_UTF32
2001 #elif SIZEOF_WCHAR_T == 2
2002     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2003     #define WC_ENC      wxFONTENCODING_UTF16
2004 #else // sizeof(wchar_t) != 2 nor 4
2005     // does this ever happen?
2006     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2007 #endif
2008
2009 // ----------------------------------------------------------------------------
2010 // wxMBConv_iconv: encapsulates an iconv character set
2011 // ----------------------------------------------------------------------------
2012
2013 class wxMBConv_iconv : public wxMBConv
2014 {
2015 public:
2016     wxMBConv_iconv(const char *name);
2017     virtual ~wxMBConv_iconv();
2018
2019     // implement base class virtual methods
2020     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2021                            const char *src, size_t srcLen = wxNO_LEN) const;
2022     virtual size_t FromWChar(char *dst, size_t dstLen,
2023                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2024     virtual size_t GetMBNulLen() const;
2025
2026 #if wxUSE_UNICODE_UTF8
2027     virtual bool IsUTF8() const;
2028 #endif
2029
2030     virtual wxMBConv *Clone() const
2031     {
2032         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2033         p->m_minMBCharWidth = m_minMBCharWidth;
2034         return p;
2035     }
2036
2037     bool IsOk() const
2038         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2039
2040 protected:
2041     // the iconv handlers used to translate from multibyte
2042     // to wide char and in the other direction
2043     iconv_t m2w,
2044             w2m;
2045
2046 #if wxUSE_THREADS
2047     // guards access to m2w and w2m objects
2048     wxMutex m_iconvMutex;
2049 #endif
2050
2051 private:
2052     // the name (for iconv_open()) of a wide char charset -- if none is
2053     // available on this machine, it will remain NULL
2054     static wxString ms_wcCharsetName;
2055
2056     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2057     // different endian-ness than the native one
2058     static bool ms_wcNeedsSwap;
2059
2060
2061     // name of the encoding handled by this conversion
2062     wxString m_name;
2063
2064     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2065     // initially
2066     size_t m_minMBCharWidth;
2067 };
2068
2069 // make the constructor available for unit testing
2070 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2071 {
2072     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2073     if ( !result->IsOk() )
2074     {
2075         delete result;
2076         return 0;
2077     }
2078
2079     return result;
2080 }
2081
2082 wxString wxMBConv_iconv::ms_wcCharsetName;
2083 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2084
2085 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2086               : m_name(name)
2087 {
2088     m_minMBCharWidth = 0;
2089
2090     // check for charset that represents wchar_t:
2091     if ( ms_wcCharsetName.empty() )
2092     {
2093         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2094
2095 #if wxUSE_FONTMAP
2096         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2097 #else // !wxUSE_FONTMAP
2098         static const wxChar *names_static[] =
2099         {
2100 #if SIZEOF_WCHAR_T == 4
2101             _T("UCS-4"),
2102 #elif SIZEOF_WCHAR_T = 2
2103             _T("UCS-2"),
2104 #endif
2105             NULL
2106         };
2107         const wxChar **names = names_static;
2108 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2109
2110         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2111         {
2112             const wxString nameCS(*names);
2113
2114             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2115             wxString nameXE(nameCS);
2116
2117 #ifdef WORDS_BIGENDIAN
2118                 nameXE += _T("BE");
2119 #else // little endian
2120                 nameXE += _T("LE");
2121 #endif
2122
2123             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2124                        nameXE.c_str());
2125
2126             m2w = iconv_open(nameXE.ToAscii(), name);
2127             if ( m2w == ICONV_T_INVALID )
2128             {
2129                 // try charset w/o bytesex info (e.g. "UCS4")
2130                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2131                            nameCS.c_str());
2132                 m2w = iconv_open(nameCS.ToAscii(), name);
2133
2134                 // and check for bytesex ourselves:
2135                 if ( m2w != ICONV_T_INVALID )
2136                 {
2137                     char    buf[2], *bufPtr;
2138                     wchar_t wbuf[2];
2139                     size_t  insz, outsz;
2140                     size_t  res;
2141
2142                     buf[0] = 'A';
2143                     buf[1] = 0;
2144                     wbuf[0] = 0;
2145                     insz = 2;
2146                     outsz = SIZEOF_WCHAR_T * 2;
2147                     char* wbufPtr = (char*)wbuf;
2148                     bufPtr = buf;
2149
2150                     res = iconv(
2151                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2152                         &wbufPtr, &outsz);
2153
2154                     if (ICONV_FAILED(res, insz))
2155                     {
2156                         wxLogLastError(wxT("iconv"));
2157                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2158                                    nameCS.c_str());
2159                     }
2160                     else // ok, can convert to this encoding, remember it
2161                     {
2162                         ms_wcCharsetName = nameCS;
2163                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2164                     }
2165                 }
2166             }
2167             else // use charset not requiring byte swapping
2168             {
2169                 ms_wcCharsetName = nameXE;
2170             }
2171         }
2172
2173         wxLogTrace(TRACE_STRCONV,
2174                    wxT("iconv wchar_t charset is \"%s\"%s"),
2175                    ms_wcCharsetName.empty() ? wxString("<none>")
2176                                             : ms_wcCharsetName,
2177                    ms_wcNeedsSwap ? _T(" (needs swap)")
2178                                   : _T(""));
2179     }
2180     else // we already have ms_wcCharsetName
2181     {
2182         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2183     }
2184
2185     if ( ms_wcCharsetName.empty() )
2186     {
2187         w2m = ICONV_T_INVALID;
2188     }
2189     else
2190     {
2191         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2192         if ( w2m == ICONV_T_INVALID )
2193         {
2194             wxLogTrace(TRACE_STRCONV,
2195                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2196                        ms_wcCharsetName.c_str(), name);
2197         }
2198     }
2199 }
2200
2201 wxMBConv_iconv::~wxMBConv_iconv()
2202 {
2203     if ( m2w != ICONV_T_INVALID )
2204         iconv_close(m2w);
2205     if ( w2m != ICONV_T_INVALID )
2206         iconv_close(w2m);
2207 }
2208
2209 size_t
2210 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2211                         const char *src, size_t srcLen) const
2212 {
2213     if ( srcLen == wxNO_LEN )
2214     {
2215         // find the string length: notice that must be done differently for
2216         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2217         // consecutive NULs
2218         const size_t nulLen = GetMBNulLen();
2219         switch ( nulLen )
2220         {
2221             default:
2222                 return wxCONV_FAILED;
2223
2224             case 1:
2225                 srcLen = strlen(src); // arguably more optimized than our version
2226                 break;
2227
2228             case 2:
2229             case 4:
2230                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2231                 // but they also have to start at character boundary and not
2232                 // span two adjacent characters
2233                 const char *p;
2234                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2235                     ;
2236                 srcLen = p - src;
2237                 break;
2238         }
2239
2240         // when we're determining the length of the string ourselves we count
2241         // the terminating NUL(s) as part of it and always NUL-terminate the
2242         // output
2243         srcLen += nulLen;
2244     }
2245
2246     // we express length in the number of (wide) characters but iconv always
2247     // counts buffer sizes it in bytes
2248     dstLen *= SIZEOF_WCHAR_T;
2249
2250 #if wxUSE_THREADS
2251     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2252     //     Unfortunately there are a couple of global wxCSConv objects such as
2253     //     wxConvLocal that are used all over wx code, so we have to make sure
2254     //     the handle is used by at most one thread at the time. Otherwise
2255     //     only a few wx classes would be safe to use from non-main threads
2256     //     as MB<->WC conversion would fail "randomly".
2257     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2258 #endif // wxUSE_THREADS
2259
2260     size_t res, cres;
2261     const char *pszPtr = src;
2262
2263     if ( dst )
2264     {
2265         char* bufPtr = (char*)dst;
2266
2267         // have destination buffer, convert there
2268         size_t dstLenOrig = dstLen;
2269         cres = iconv(m2w,
2270                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2271                      &bufPtr, &dstLen);
2272
2273         // convert the number of bytes converted as returned by iconv to the
2274         // number of (wide) characters converted that we need
2275         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2276
2277         if (ms_wcNeedsSwap)
2278         {
2279             // convert to native endianness
2280             for ( unsigned i = 0; i < res; i++ )
2281                 dst[i] = WC_BSWAP(dst[i]);
2282         }
2283     }
2284     else // no destination buffer
2285     {
2286         // convert using temp buffer to calculate the size of the buffer needed
2287         wchar_t tbuf[8];
2288         res = 0;
2289
2290         do
2291         {
2292             char* bufPtr = (char*)tbuf;
2293             dstLen = 8 * SIZEOF_WCHAR_T;
2294
2295             cres = iconv(m2w,
2296                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2297                          &bufPtr, &dstLen );
2298
2299             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2300         }
2301         while ((cres == (size_t)-1) && (errno == E2BIG));
2302     }
2303
2304     if (ICONV_FAILED(cres, srcLen))
2305     {
2306         //VS: it is ok if iconv fails, hence trace only
2307         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2308         return wxCONV_FAILED;
2309     }
2310
2311     return res;
2312 }
2313
2314 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2315                                  const wchar_t *src, size_t srcLen) const
2316 {
2317 #if wxUSE_THREADS
2318     // NB: explained in MB2WC
2319     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2320 #endif
2321
2322     if ( srcLen == wxNO_LEN )
2323         srcLen = wxWcslen(src) + 1;
2324
2325     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2326     size_t outbuflen = dstLen;
2327     size_t res, cres;
2328
2329     wchar_t *tmpbuf = 0;
2330
2331     if (ms_wcNeedsSwap)
2332     {
2333         // need to copy to temp buffer to switch endianness
2334         // (doing WC_BSWAP twice on the original buffer won't help, as it
2335         //  could be in read-only memory, or be accessed in some other thread)
2336         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2337         for ( size_t i = 0; i < srcLen; i++ )
2338             tmpbuf[i] = WC_BSWAP(src[i]);
2339
2340         tmpbuf[srcLen] = L'\0';
2341         src = tmpbuf;
2342     }
2343
2344     char* inbuf = (char*)src;
2345     if ( dst )
2346     {
2347         // have destination buffer, convert there
2348         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2349
2350         res = dstLen - outbuflen;
2351     }
2352     else // no destination buffer
2353     {
2354         // convert using temp buffer to calculate the size of the buffer needed
2355         char tbuf[16];
2356         res = 0;
2357         do
2358         {
2359             dst = tbuf;
2360             outbuflen = 16;
2361
2362             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2363
2364             res += 16 - outbuflen;
2365         }
2366         while ((cres == (size_t)-1) && (errno == E2BIG));
2367     }
2368
2369     if (ms_wcNeedsSwap)
2370     {
2371         free(tmpbuf);
2372     }
2373
2374     if (ICONV_FAILED(cres, inbuflen))
2375     {
2376         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2377         return wxCONV_FAILED;
2378     }
2379
2380     return res;
2381 }
2382
2383 size_t wxMBConv_iconv::GetMBNulLen() const
2384 {
2385     if ( m_minMBCharWidth == 0 )
2386     {
2387         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2388
2389 #if wxUSE_THREADS
2390         // NB: explained in MB2WC
2391         wxMutexLocker lock(self->m_iconvMutex);
2392 #endif
2393
2394         const wchar_t *wnul = L"";
2395         char buf[8]; // should be enough for NUL in any encoding
2396         size_t inLen = sizeof(wchar_t),
2397                outLen = WXSIZEOF(buf);
2398         char *inBuff = (char *)wnul;
2399         char *outBuff = buf;
2400         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2401         {
2402             self->m_minMBCharWidth = (size_t)-1;
2403         }
2404         else // ok
2405         {
2406             self->m_minMBCharWidth = outBuff - buf;
2407         }
2408     }
2409
2410     return m_minMBCharWidth;
2411 }
2412
2413 #if wxUSE_UNICODE_UTF8
2414 bool wxMBConv_iconv::IsUTF8() const
2415 {
2416     return wxStricmp(m_name, "UTF-8") == 0 ||
2417            wxStricmp(m_name, "UTF8") == 0;
2418 }
2419 #endif
2420
2421 #endif // HAVE_ICONV
2422
2423
2424 // ============================================================================
2425 // Win32 conversion classes
2426 // ============================================================================
2427
2428 #ifdef wxHAVE_WIN32_MB2WC
2429
2430 // from utils.cpp
2431 #if wxUSE_FONTMAP
2432 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2433 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2434 #endif
2435
2436 class wxMBConv_win32 : public wxMBConv
2437 {
2438 public:
2439     wxMBConv_win32()
2440     {
2441         m_CodePage = CP_ACP;
2442         m_minMBCharWidth = 0;
2443     }
2444
2445     wxMBConv_win32(const wxMBConv_win32& conv)
2446         : wxMBConv()
2447     {
2448         m_CodePage = conv.m_CodePage;
2449         m_minMBCharWidth = conv.m_minMBCharWidth;
2450     }
2451
2452 #if wxUSE_FONTMAP
2453     wxMBConv_win32(const char* name)
2454     {
2455         m_CodePage = wxCharsetToCodepage(name);
2456         m_minMBCharWidth = 0;
2457     }
2458
2459     wxMBConv_win32(wxFontEncoding encoding)
2460     {
2461         m_CodePage = wxEncodingToCodepage(encoding);
2462         m_minMBCharWidth = 0;
2463     }
2464 #endif // wxUSE_FONTMAP
2465
2466     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2467     {
2468         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2469         // the behaviour is not compatible with the Unix version (using iconv)
2470         // and break the library itself, e.g. wxTextInputStream::NextChar()
2471         // wouldn't work if reading an incomplete MB char didn't result in an
2472         // error
2473         //
2474         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2475         // Win XP or newer and it is not supported for UTF-[78] so we always
2476         // use our own conversions in this case. See
2477         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2478         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2479         if ( m_CodePage == CP_UTF8 )
2480         {
2481             return wxMBConvUTF8().MB2WC(buf, psz, n);
2482         }
2483
2484         if ( m_CodePage == CP_UTF7 )
2485         {
2486             return wxMBConvUTF7().MB2WC(buf, psz, n);
2487         }
2488
2489         int flags = 0;
2490         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2491                 IsAtLeastWin2kSP4() )
2492         {
2493             flags = MB_ERR_INVALID_CHARS;
2494         }
2495
2496         const size_t len = ::MultiByteToWideChar
2497                              (
2498                                 m_CodePage,     // code page
2499                                 flags,          // flags: fall on error
2500                                 psz,            // input string
2501                                 -1,             // its length (NUL-terminated)
2502                                 buf,            // output string
2503                                 buf ? n : 0     // size of output buffer
2504                              );
2505         if ( !len )
2506         {
2507             // function totally failed
2508             return wxCONV_FAILED;
2509         }
2510
2511         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2512         // check if we succeeded, by doing a double trip:
2513         if ( !flags && buf )
2514         {
2515             const size_t mbLen = strlen(psz);
2516             wxCharBuffer mbBuf(mbLen);
2517             if ( ::WideCharToMultiByte
2518                    (
2519                       m_CodePage,
2520                       0,
2521                       buf,
2522                       -1,
2523                       mbBuf.data(),
2524                       mbLen + 1,        // size in bytes, not length
2525                       NULL,
2526                       NULL
2527                    ) == 0 ||
2528                   strcmp(mbBuf, psz) != 0 )
2529             {
2530                 // we didn't obtain the same thing we started from, hence
2531                 // the conversion was lossy and we consider that it failed
2532                 return wxCONV_FAILED;
2533             }
2534         }
2535
2536         // note that it returns count of written chars for buf != NULL and size
2537         // of the needed buffer for buf == NULL so in either case the length of
2538         // the string (which never includes the terminating NUL) is one less
2539         return len - 1;
2540     }
2541
2542     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2543     {
2544         /*
2545             we have a problem here: by default, WideCharToMultiByte() may
2546             replace characters unrepresentable in the target code page with bad
2547             quality approximations such as turning "1/2" symbol (U+00BD) into
2548             "1" for the code pages which don't have it and we, obviously, want
2549             to avoid this at any price
2550
2551             the trouble is that this function does it _silently_, i.e. it won't
2552             even tell us whether it did or not... Win98/2000 and higher provide
2553             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2554             we have to resort to a round trip, i.e. check that converting back
2555             results in the same string -- this is, of course, expensive but
2556             otherwise we simply can't be sure to not garble the data.
2557          */
2558
2559         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2560         // it doesn't work with CJK encodings (which we test for rather roughly
2561         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2562         // supporting it
2563         BOOL usedDef wxDUMMY_INITIALIZE(false);
2564         BOOL *pUsedDef;
2565         int flags;
2566         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2567         {
2568             // it's our lucky day
2569             flags = WC_NO_BEST_FIT_CHARS;
2570             pUsedDef = &usedDef;
2571         }
2572         else // old system or unsupported encoding
2573         {
2574             flags = 0;
2575             pUsedDef = NULL;
2576         }
2577
2578         const size_t len = ::WideCharToMultiByte
2579                              (
2580                                 m_CodePage,     // code page
2581                                 flags,          // either none or no best fit
2582                                 pwz,            // input string
2583                                 -1,             // it is (wide) NUL-terminated
2584                                 buf,            // output buffer
2585                                 buf ? n : 0,    // and its size
2586                                 NULL,           // default "replacement" char
2587                                 pUsedDef        // [out] was it used?
2588                              );
2589
2590         if ( !len )
2591         {
2592             // function totally failed
2593             return wxCONV_FAILED;
2594         }
2595
2596         // we did something, check if we really succeeded
2597         if ( flags )
2598         {
2599             // check if the conversion failed, i.e. if any replacements
2600             // were done
2601             if ( usedDef )
2602                 return wxCONV_FAILED;
2603         }
2604         else // we must resort to double tripping...
2605         {
2606             // first we need to ensure that we really have the MB data: this is
2607             // not the case if we're called with NULL buffer, in which case we
2608             // need to do the conversion yet again
2609             wxCharBuffer bufDef;
2610             if ( !buf )
2611             {
2612                 bufDef = wxCharBuffer(len);
2613                 buf = bufDef.data();
2614                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2615                                             buf, len, NULL, NULL) )
2616                     return wxCONV_FAILED;
2617             }
2618
2619             if ( !n )
2620                 n = wcslen(pwz);
2621             wxWCharBuffer wcBuf(n);
2622             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2623                     wcscmp(wcBuf, pwz) != 0 )
2624             {
2625                 // we didn't obtain the same thing we started from, hence
2626                 // the conversion was lossy and we consider that it failed
2627                 return wxCONV_FAILED;
2628             }
2629         }
2630
2631         // see the comment above for the reason of "len - 1"
2632         return len - 1;
2633     }
2634
2635     virtual size_t GetMBNulLen() const
2636     {
2637         if ( m_minMBCharWidth == 0 )
2638         {
2639             int len = ::WideCharToMultiByte
2640                         (
2641                             m_CodePage,     // code page
2642                             0,              // no flags
2643                             L"",            // input string
2644                             1,              // translate just the NUL
2645                             NULL,           // output buffer
2646                             0,              // and its size
2647                             NULL,           // no replacement char
2648                             NULL            // [out] don't care if it was used
2649                         );
2650
2651             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2652             switch ( len )
2653             {
2654                 default:
2655                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2656                     self->m_minMBCharWidth = (size_t)-1;
2657                     break;
2658
2659                 case 0:
2660                     self->m_minMBCharWidth = (size_t)-1;
2661                     break;
2662
2663                 case 1:
2664                 case 2:
2665                 case 4:
2666                     self->m_minMBCharWidth = len;
2667                     break;
2668             }
2669         }
2670
2671         return m_minMBCharWidth;
2672     }
2673
2674     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2675
2676     bool IsOk() const { return m_CodePage != -1; }
2677
2678 private:
2679     static bool CanUseNoBestFit()
2680     {
2681         static int s_isWin98Or2k = -1;
2682
2683         if ( s_isWin98Or2k == -1 )
2684         {
2685             int verMaj, verMin;
2686             switch ( wxGetOsVersion(&verMaj, &verMin) )
2687             {
2688                 case wxOS_WINDOWS_9X:
2689                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2690                     break;
2691
2692                 case wxOS_WINDOWS_NT:
2693                     s_isWin98Or2k = verMaj >= 5;
2694                     break;
2695
2696                 default:
2697                     // unknown: be conservative by default
2698                     s_isWin98Or2k = 0;
2699                     break;
2700             }
2701
2702             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2703         }
2704
2705         return s_isWin98Or2k == 1;
2706     }
2707
2708     static bool IsAtLeastWin2kSP4()
2709     {
2710 #ifdef __WXWINCE__
2711         return false;
2712 #else
2713         static int s_isAtLeastWin2kSP4 = -1;
2714
2715         if ( s_isAtLeastWin2kSP4 == -1 )
2716         {
2717             OSVERSIONINFOEX ver;
2718
2719             memset(&ver, 0, sizeof(ver));
2720             ver.dwOSVersionInfoSize = sizeof(ver);
2721             GetVersionEx((OSVERSIONINFO*)&ver);
2722
2723             s_isAtLeastWin2kSP4 =
2724               ((ver.dwMajorVersion > 5) || // Vista+
2725                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2726                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2727                ver.wServicePackMajor >= 4)) // 2000 SP4+
2728               ? 1 : 0;
2729         }
2730
2731         return s_isAtLeastWin2kSP4 == 1;
2732 #endif
2733     }
2734
2735
2736     // the code page we're working with
2737     long m_CodePage;
2738
2739     // cached result of GetMBNulLen(), set to 0 initially meaning
2740     // "unknown"
2741     size_t m_minMBCharWidth;
2742 };
2743
2744 #endif // wxHAVE_WIN32_MB2WC
2745
2746
2747 // ============================================================================
2748 // wxEncodingConverter based conversion classes
2749 // ============================================================================
2750
2751 #if wxUSE_FONTMAP
2752
2753 class wxMBConv_wxwin : public wxMBConv
2754 {
2755 private:
2756     void Init()
2757     {
2758         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2759         // The wxMBConv_cf class does a better job.
2760         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2761                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2762                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2763     }
2764
2765 public:
2766     // temporarily just use wxEncodingConverter stuff,
2767     // so that it works while a better implementation is built
2768     wxMBConv_wxwin(const char* name)
2769     {
2770         if (name)
2771             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2772         else
2773             m_enc = wxFONTENCODING_SYSTEM;
2774
2775         Init();
2776     }
2777
2778     wxMBConv_wxwin(wxFontEncoding enc)
2779     {
2780         m_enc = enc;
2781
2782         Init();
2783     }
2784
2785     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2786     {
2787         size_t inbuf = strlen(psz);
2788         if (buf)
2789         {
2790             if (!m2w.Convert(psz, buf))
2791                 return wxCONV_FAILED;
2792         }
2793         return inbuf;
2794     }
2795
2796     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2797     {
2798         const size_t inbuf = wxWcslen(psz);
2799         if (buf)
2800         {
2801             if (!w2m.Convert(psz, buf))
2802                 return wxCONV_FAILED;
2803         }
2804
2805         return inbuf;
2806     }
2807
2808     virtual size_t GetMBNulLen() const
2809     {
2810         switch ( m_enc )
2811         {
2812             case wxFONTENCODING_UTF16BE:
2813             case wxFONTENCODING_UTF16LE:
2814                 return 2;
2815
2816             case wxFONTENCODING_UTF32BE:
2817             case wxFONTENCODING_UTF32LE:
2818                 return 4;
2819
2820             default:
2821                 return 1;
2822         }
2823     }
2824
2825     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2826
2827     bool IsOk() const { return m_ok; }
2828
2829 public:
2830     wxFontEncoding m_enc;
2831     wxEncodingConverter m2w, w2m;
2832
2833 private:
2834     // were we initialized successfully?
2835     bool m_ok;
2836
2837     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2838 };
2839
2840 // make the constructors available for unit testing
2841 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2842 {
2843     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2844     if ( !result->IsOk() )
2845     {
2846         delete result;
2847         return 0;
2848     }
2849
2850     return result;
2851 }
2852
2853 #endif // wxUSE_FONTMAP
2854
2855 // ============================================================================
2856 // wxCSConv implementation
2857 // ============================================================================
2858
2859 void wxCSConv::Init()
2860 {
2861     m_name = NULL;
2862     m_convReal =  NULL;
2863     m_deferred = true;
2864 }
2865
2866 wxCSConv::wxCSConv(const wxString& charset)
2867 {
2868     Init();
2869
2870     if ( !charset.empty() )
2871     {
2872         SetName(charset.ToAscii());
2873     }
2874
2875 #if wxUSE_FONTMAP
2876     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2877 #else
2878     m_encoding = wxFONTENCODING_SYSTEM;
2879 #endif
2880 }
2881
2882 wxCSConv::wxCSConv(wxFontEncoding encoding)
2883 {
2884     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2885     {
2886         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2887
2888         encoding = wxFONTENCODING_SYSTEM;
2889     }
2890
2891     Init();
2892
2893     m_encoding = encoding;
2894 }
2895
2896 wxCSConv::~wxCSConv()
2897 {
2898     Clear();
2899 }
2900
2901 wxCSConv::wxCSConv(const wxCSConv& conv)
2902         : wxMBConv()
2903 {
2904     Init();
2905
2906     SetName(conv.m_name);
2907     m_encoding = conv.m_encoding;
2908 }
2909
2910 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2911 {
2912     Clear();
2913
2914     SetName(conv.m_name);
2915     m_encoding = conv.m_encoding;
2916
2917     return *this;
2918 }
2919
2920 void wxCSConv::Clear()
2921 {
2922     free(m_name);
2923     delete m_convReal;
2924
2925     m_name = NULL;
2926     m_convReal = NULL;
2927 }
2928
2929 void wxCSConv::SetName(const char *charset)
2930 {
2931     if (charset)
2932     {
2933         m_name = wxStrdup(charset);
2934         m_deferred = true;
2935     }
2936 }
2937
2938 #if wxUSE_FONTMAP
2939
// Hash map from a font encoding to a charset name: wxCSConv::DoCreate() stores
// in it the name for which an iconv-based converter could be created, or an
// empty string if this failed for all the names of the encoding.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the single, file-local instance of the cache
static wxEncodingNameCache gs_nameCache;
2944 #endif
2945
2946 wxMBConv *wxCSConv::DoCreate() const
2947 {
2948 #if wxUSE_FONTMAP
2949     wxLogTrace(TRACE_STRCONV,
2950                wxT("creating conversion for %s"),
2951                (m_name ? m_name
2952                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2953 #endif // wxUSE_FONTMAP
2954
2955     // check for the special case of ASCII or ISO8859-1 charset: as we have
2956     // special knowledge of it anyhow, we don't need to create a special
2957     // conversion object
2958     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2959             m_encoding == wxFONTENCODING_DEFAULT )
2960     {
2961         // don't convert at all
2962         return NULL;
2963     }
2964
2965     // we trust OS to do conversion better than we can so try external
2966     // conversion methods first
2967     //
2968     // the full order is:
2969     //      1. OS conversion (iconv() under Unix or Win32 API)
2970     //      2. hard coded conversions for UTF
2971     //      3. wxEncodingConverter as fall back
2972
2973     // step (1)
2974 #ifdef HAVE_ICONV
2975 #if !wxUSE_FONTMAP
2976     if ( m_name )
2977 #endif // !wxUSE_FONTMAP
2978     {
2979 #if wxUSE_FONTMAP
2980         wxFontEncoding encoding(m_encoding);
2981 #endif
2982
2983         if ( m_name )
2984         {
2985             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
2986             if ( conv->IsOk() )
2987                 return conv;
2988
2989             delete conv;
2990
2991 #if wxUSE_FONTMAP
2992             encoding =
2993                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2994 #endif // wxUSE_FONTMAP
2995         }
2996 #if wxUSE_FONTMAP
2997         {
2998             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
2999             if ( it != gs_nameCache.end() )
3000             {
3001                 if ( it->second.empty() )
3002                     return NULL;
3003
3004                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3005                 if ( conv->IsOk() )
3006                     return conv;
3007
3008                 delete conv;
3009             }
3010
3011             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3012             // CS : in case this does not return valid names (eg for MacRoman)
3013             // encoding got a 'failure' entry in the cache all the same,
3014             // although it just has to be created using a different method, so
3015             // only store failed iconv creation attempts (or perhaps we
3016             // shoulnd't do this at all ?)
3017             if ( names[0] != NULL )
3018             {
3019                 for ( ; *names; ++names )
3020                 {
3021                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3022                     //             will need changes that will obsolete this
3023                     wxString name(*names);
3024                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3025                     if ( conv->IsOk() )
3026                     {
3027                         gs_nameCache[encoding] = *names;
3028                         return conv;
3029                     }
3030
3031                     delete conv;
3032                 }
3033
3034                 gs_nameCache[encoding] = _T(""); // cache the failure
3035             }
3036         }
3037 #endif // wxUSE_FONTMAP
3038     }
3039 #endif // HAVE_ICONV
3040
3041 #ifdef wxHAVE_WIN32_MB2WC
3042     {
3043 #if wxUSE_FONTMAP
3044         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3045                                       : new wxMBConv_win32(m_encoding);
3046         if ( conv->IsOk() )
3047             return conv;
3048
3049         delete conv;
3050 #else
3051         return NULL;
3052 #endif
3053     }
3054 #endif // wxHAVE_WIN32_MB2WC
3055
3056 #ifdef __DARWIN__
3057     {
3058         // leave UTF16 and UTF32 to the built-ins of wx
3059         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3060             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3061         {
3062 #if wxUSE_FONTMAP
3063             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3064                                           : new wxMBConv_cf(m_encoding);
3065 #else
3066             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3067 #endif
3068
3069             if ( conv->IsOk() )
3070                  return conv;
3071
3072             delete conv;
3073         }
3074     }
3075 #endif // __DARWIN__
3076
3077     // step (2)
3078     wxFontEncoding enc = m_encoding;
3079 #if wxUSE_FONTMAP
3080     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3081     {
3082         // use "false" to suppress interactive dialogs -- we can be called from
3083         // anywhere and popping up a dialog from here is the last thing we want to
3084         // do
3085         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3086     }
3087 #endif // wxUSE_FONTMAP
3088
3089     switch ( enc )
3090     {
3091         case wxFONTENCODING_UTF7:
3092              return new wxMBConvUTF7;
3093
3094         case wxFONTENCODING_UTF8:
3095              return new wxMBConvUTF8;
3096
3097         case wxFONTENCODING_UTF16BE:
3098              return new wxMBConvUTF16BE;
3099
3100         case wxFONTENCODING_UTF16LE:
3101              return new wxMBConvUTF16LE;
3102
3103         case wxFONTENCODING_UTF32BE:
3104              return new wxMBConvUTF32BE;
3105
3106         case wxFONTENCODING_UTF32LE:
3107              return new wxMBConvUTF32LE;
3108
3109         default:
3110              // nothing to do but put here to suppress gcc warnings
3111              break;
3112     }
3113
3114     // step (3)
3115 #if wxUSE_FONTMAP
3116     {
3117         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3118                                       : new wxMBConv_wxwin(m_encoding);
3119         if ( conv->IsOk() )
3120             return conv;
3121
3122         delete conv;
3123     }
3124 #endif // wxUSE_FONTMAP
3125
3126     // NB: This is a hack to prevent deadlock. What could otherwise happen
3127     //     in Unicode build: wxConvLocal creation ends up being here
3128     //     because of some failure and logs the error. But wxLog will try to
3129     //     attach a timestamp, for which it will need wxConvLocal (to convert
3130     //     time to char* and then wchar_t*), but that fails, tries to log the
3131     //     error, but wxLog has an (already locked) critical section that
3132     //     guards the static buffer.
3133     static bool alreadyLoggingError = false;
3134     if (!alreadyLoggingError)
3135     {
3136         alreadyLoggingError = true;
3137         wxLogError(_("Cannot convert from the charset '%s'!"),
3138                    m_name ? m_name
3139                       :
3140 #if wxUSE_FONTMAP
3141                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3142 #else // !wxUSE_FONTMAP
3143                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3144 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3145               );
3146
3147         alreadyLoggingError = false;
3148     }
3149
3150     return NULL;
3151 }
3152
3153 void wxCSConv::CreateConvIfNeeded() const
3154 {
3155     if ( m_deferred )
3156     {
3157         wxCSConv *self = (wxCSConv *)this; // const_cast
3158
3159         // if we don't have neither the name nor the encoding, use the default
3160         // encoding for this system
3161         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3162         {
3163 #if wxUSE_INTL
3164             self->m_encoding = wxLocale::GetSystemEncoding();
3165 #else
3166             // fallback to some reasonable default:
3167             self->m_encoding = wxFONTENCODING_ISO8859_1;
3168 #endif // wxUSE_INTL
3169         }
3170
3171         self->m_convReal = DoCreate();
3172         self->m_deferred = false;
3173     }
3174 }
3175
3176 bool wxCSConv::IsOk() const
3177 {
3178     CreateConvIfNeeded();
3179
3180     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3181     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3182         return true; // always ok as we do it ourselves
3183
3184     // m_convReal->IsOk() is called at its own creation, so we know it must
3185     // be ok if m_convReal is non-NULL
3186     return m_convReal != NULL;
3187 }
3188
3189 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3190                          const char *src, size_t srcLen) const
3191 {
3192     CreateConvIfNeeded();
3193
3194     if (m_convReal)
3195         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3196
3197     // latin-1 (direct)
3198     if ( srcLen == wxNO_LEN )
3199         srcLen = strlen(src) + 1; // take trailing NUL too
3200
3201     if ( dst )
3202     {
3203         if ( dstLen < srcLen )
3204             return wxCONV_FAILED;
3205
3206         for ( size_t n = 0; n < srcLen; n++ )
3207             dst[n] = (unsigned char)(src[n]);
3208     }
3209
3210     return srcLen;
3211 }
3212
3213 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3214                            const wchar_t *src, size_t srcLen) const
3215 {
3216     CreateConvIfNeeded();
3217
3218     if (m_convReal)
3219         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3220
3221     // latin-1 (direct)
3222     if ( srcLen == wxNO_LEN )
3223         srcLen = wxWcslen(src) + 1;
3224
3225     if ( dst )
3226     {
3227         if ( dstLen < srcLen )
3228             return wxCONV_FAILED;
3229
3230         for ( size_t n = 0; n < srcLen; n++ )
3231         {
3232             if ( src[n] > 0xFF )
3233                 return wxCONV_FAILED;
3234
3235             dst[n] = (char)src[n];
3236         }
3237
3238     }
3239     else // still need to check the input validity
3240     {
3241         for ( size_t n = 0; n < srcLen; n++ )
3242         {
3243             if ( src[n] > 0xFF )
3244                 return wxCONV_FAILED;
3245         }
3246     }
3247
3248     return srcLen;
3249 }
3250
3251 size_t wxCSConv::GetMBNulLen() const
3252 {
3253     CreateConvIfNeeded();
3254
3255     if ( m_convReal )
3256     {
3257         return m_convReal->GetMBNulLen();
3258     }
3259
3260     // otherwise, we are ISO-8859-1
3261     return 1;
3262 }
3263
3264 #if wxUSE_UNICODE_UTF8
3265 bool wxCSConv::IsUTF8() const
3266 {
3267     CreateConvIfNeeded();
3268
3269     if ( m_convReal )
3270     {
3271         return m_convReal->IsUTF8();
3272     }
3273
3274     // otherwise, we are ISO-8859-1
3275     return false;
3276 }
3277 #endif
3278
3279
3280 #if wxUSE_UNICODE
3281
3282 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3283 {
3284     if ( !s )
3285         return wxWCharBuffer();
3286
3287     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3288     if ( !wbuf )
3289         wbuf = wxMBConvUTF8().cMB2WX(s);
3290     if ( !wbuf )
3291         wbuf = wxConvISO8859_1.cMB2WX(s);
3292
3293     return wbuf;
3294 }
3295
3296 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3297 {
3298     if ( !ws )
3299         return wxCharBuffer();
3300
3301     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3302     if ( !buf )
3303         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3304
3305     return buf;
3306 }
3307
3308 #endif // wxUSE_UNICODE
3309
3310 // ----------------------------------------------------------------------------
3311 // globals
3312 // ----------------------------------------------------------------------------
3313
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3320
// The names of the global converters are used as plain identifiers (and are
// pasted into other identifiers) below, so get rid of any macros with these
// names first (presumably they come from wx/strconv.h -- to be confirmed).
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define everything needed for the global converter called "name":
//  - name##Ptr: exported pointer of type klass*, initially NULL
//  - wxGet_##name##Ptr(): exported function returning the address of its
//    function-local static object of type impl_klass, which is constructed
//    with ctor_args
//  - gs_##name##instance: file-local pointer initialized by calling the
//    function above
//
// The terminating semicolon must be provided by the user of the macro.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version for the case when the object is of the same class as
// the exported pointer.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)

// wxConvLibc is exported as a wxMBConv but implemented by wxMBConv_win32
// under Windows and by wxMBConvLibc elsewhere, both default-constructed
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
3347
// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
// default-constructed UTF-8 and UTF-7 converters
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// wxCSConv-based converters for the system encoding and for ISO-8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// wxConvCurrent initially points to the wxConvLibc object and wxConvUI to the
// wxConvLocal one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// wxConvFileName points to the CF-based UTF-8 converter defined just above
// under Darwin and to the wxConvLibc object everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3375
3376 #else // !wxUSE_WCHAR_T
3377
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: the converters are then defined as plain
// wxMBConv objects
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3384
3385 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T