src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
  62 // WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise it's supposed to
  63 // be 4 bytes
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
 130 #ifdef WC_UTF16
 131     typedef wchar_t wxDecodeSurrogate_t;
 132 #else // !WC_UTF16
 133     typedef wxUint16 wxDecodeSurrogate_t;
 134 #endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(wx_reinterpret_cast(const wxUint16 *, *pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of ToWChar() in terms of the old MB2WC().
//
// Converts srcLen bytes starting at src, or the entire NUL-terminated string
// if srcLen is wxNO_LEN. If dst is not NULL the result is stored in it and
// dstLen is its size in wide characters, otherwise only the length is
// computed.
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() failed or if dstLen is too small.
// The trailing NUL is included in the returned count only if srcLen is
// wxNO_LEN, NULs followed by more input are always counted.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // no explicit length: NULL srcEnd makes us quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complication come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways depending on
    // how it is called:
    //
    //  - for the entire NUL-terminated string (srcEnd == NULL) there is a
    //    single chunk and its trailing NUL is always counted
    //
    //  - for a fixed number of characters (srcEnd != NULL) the trailing NUL
    //    is not counted -- but all the NULs inside the string still are
    //
    // so in the (simple) first case we just always count the trailing NUL,
    // but in the second one we need to wait until we see if there is going to
    // be another loop iteration and only count the NUL then
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        // NOTE(review): an empty chunk ends the conversion even if srcEnd was
        // not reached yet, so with an explicit length any input following a
        // leading NUL or two consecutive NULs is not converted; also nothing,
        // not even the NUL, is stored in dst in this case -- confirm that
        // this is intended
        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL: notice that it is stored even when it
            // is not included in dstWritten
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;

        // if we got here then this wasn't the last chunk in this string and
        // hence we must count an extra char for L'\0' even when converting a
        // fixed number of characters
        //
        // (srcEnd can't be NULL here as we would have left the loop above, so
        // this test is always true)
        if ( srcEnd )
        {
            dstWritten++;
            if ( dst )
                dst++;
        }
    }

    return dstWritten;
}
 296
 297 size_t
 298 wxMBConv::FromWChar(char *dst, size_t dstLen,
 299                     const wchar_t *src, size_t srcLen) const
 300 {
 301     // the number of chars [which would be] written to dst [if it were not NULL]
 302     size_t dstWritten = 0;
 303
 304     // if we don't know its length we have no choice but to assume that it is
 305     // NUL-terminated (notice that it can still be NUL-terminated even if
 306     // explicit length is given but it doesn't change our return value)
 307     const bool isNulTerminated = srcLen == wxNO_LEN;
 308
 309     // make a copy of the input string unless it is already properly
 310     // NUL-terminated
 311     wxWCharBuffer bufTmp;
 312     if ( isNulTerminated )
 313     {
 314         srcLen = wxWcslen(src) + 1;
 315     }
 316     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 317     {
 318         // make a copy in order to properly NUL-terminate the string
 319         bufTmp = wxWCharBuffer(srcLen);
 320         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 321         src = bufTmp;
 322     }
 323
 324     const size_t lenNul = GetMBNulLen();
 325     for ( const wchar_t * const srcEnd = src + srcLen;
 326           src < srcEnd;
 327           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 328     {
 329         // try to convert the current chunk
 330         size_t lenChunk = WC2MB(NULL, src, 0);
 331
 332         if ( lenChunk == wxCONV_FAILED )
 333             return wxCONV_FAILED;
 334
 335         dstWritten += lenChunk;
 336         if ( isNulTerminated )
 337             dstWritten += lenNul;
 338
 339         if ( dst )
 340         {
 341             if ( dstWritten > dstLen )
 342                 return wxCONV_FAILED;
 343
 344             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 345                 return wxCONV_FAILED;
 346
 347             dst += lenChunk;
 348             if ( isNulTerminated )
 349                 dst += lenNul;
 350         }
 351     }
 352
 353     return dstWritten;
 354 }
 355
 356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 357 {
 358     size_t rc = ToWChar(outBuff, outLen, inBuff);
 359     if ( rc != wxCONV_FAILED )
 360     {
 361         // ToWChar() returns the buffer length, i.e. including the trailing
 362         // NUL, while this method doesn't take it into account
 363         rc--;
 364     }
 365
 366     return rc;
 367 }
 368
 369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 370 {
 371     size_t rc = FromWChar(outBuff, outLen, inBuff);
 372     if ( rc != wxCONV_FAILED )
 373     {
 374         rc -= GetMBNulLen();
 375     }
 376
 377     return rc;
 378 }
 379
// Empty destructor, deliberately defined out of line in this file (see the
// comment in its body).
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 384
 385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 386 {
 387     if ( psz )
 388     {
 389         // calculate the length of the buffer needed first
 390         const size_t nLen = ToWChar(NULL, 0, psz);
 391         if ( nLen != wxCONV_FAILED )
 392         {
 393             // now do the actual conversion
 394             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 395
 396             // +1 for the trailing NULL
 397             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 398                 return buf;
 399         }
 400     }
 401
 402     return wxWCharBuffer();
 403 }
 404
 405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 406 {
 407     if ( pwz )
 408     {
 409         const size_t nLen = FromWChar(NULL, 0, pwz);
 410         if ( nLen != wxCONV_FAILED )
 411         {
 412             wxCharBuffer buf(nLen - 1);
 413             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 414                 return buf;
 415         }
 416     }
 417
 418     return wxCharBuffer();
 419 }
 420
 421 const wxWCharBuffer
 422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 423 {
 424     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 425     if ( dstLen != wxCONV_FAILED )
 426     {
 427         // notice that we allocate space for dstLen+1 wide characters here
 428         // because we want the buffer to always be NUL-terminated, even if the
 429         // input isn't (as otherwise the caller has no way to know its length)
 430         wxWCharBuffer wbuf(dstLen);
 431         wbuf.data()[dstLen] = L'\0';
 432         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 433         {
 434             if ( outLen )
 435             {
 436                 *outLen = dstLen;
 437
 438                 // we also need to handle NUL-terminated input strings
 439                 // specially: for them the output is the length of the string
 440                 // excluding the trailing NUL, however if we're asked to
 441                 // convert a specific number of characters we return the length
 442                 // of the resulting output even if it's NUL-terminated
 443                 if ( inLen == wxNO_LEN )
 444                     (*outLen)--;
 445             }
 446
 447             return wbuf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxWCharBuffer();
 455 }
 456
 457 const wxCharBuffer
 458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 459 {
 460     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 461     if ( dstLen != wxCONV_FAILED )
 462     {
 463         const size_t nulLen = GetMBNulLen();
 464
 465         // as above, ensure that the buffer is always NUL-terminated, even if
 466         // the input is not
 467         wxCharBuffer buf(dstLen + nulLen - 1);
 468         memset(buf.data() + dstLen, 0, nulLen);
 469         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 470         {
 471             if ( outLen )
 472             {
 473                 *outLen = dstLen;
 474
 475                 if ( inLen == wxNO_LEN )
 476                 {
 477                     // in this case both input and output are NUL-terminated
 478                     // and we're not supposed to count NUL
 479                     *outLen -= nulLen;
 480                 }
 481             }
 482
 483             return buf;
 484         }
 485     }
 486
 487     if ( outLen )
 488         *outLen = 0;
 489
 490     return wxCharBuffer();
 491 }
 492
 493 // ----------------------------------------------------------------------------
 494 // wxMBConvLibc
 495 // ----------------------------------------------------------------------------
 496
// Simply forwards to wxMB2WC() with the same arguments and returns its result
// (presumably wxMB2WC() wraps the C library mbstowcs() -- confirm).
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 501
// Simply forwards to wxWC2MB() with the same arguments and returns its result
// (presumably wxWC2MB() wraps the C library wcstombs() -- confirm).
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 506
 507 // ----------------------------------------------------------------------------
 508 // wxConvBrokenFileNames
 509 // ----------------------------------------------------------------------------
 510
 511 #ifdef __UNIX__
 512
 513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 514 {
 515     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 516          wxStricmp(charset, _T("UTF8")) == 0  )
 517         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 518     else
 519         m_conv = new wxCSConv(charset);
 520 }
 521
 522 #endif // __UNIX__
 523
 524 // ----------------------------------------------------------------------------
 525 // UTF-7
 526 // ----------------------------------------------------------------------------
 527
 528 // Implementation (C) 2004 Fredrik Roubert
 529 //
 530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 531
 532 //
 533 // BASE64 decoding table
 534 //
// indexed by the byte value: gives the 6 bit value (0..63) encoded by this
// byte in BASE64 or 0xff if the byte is not a BASE64 character; each row below
// covers 8 consecutive byte values
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f, // '+' and '/'
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, // '0'..'7'
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, // '8', '9'
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, // 'A'..'G'
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff, // 'X'..'Z'
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, // 'a'..'g'
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff, // 'x'..'z'
    // none of the bytes with the high bit set is valid
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 570
 571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 572                              const char *src, size_t srcLen) const
 573 {
 574     DecoderState stateOrig,
 575                 *statePtr;
 576     if ( srcLen == wxNO_LEN )
 577     {
 578         // convert the entire string, up to and including the trailing NUL
 579         srcLen = strlen(src) + 1;
 580
 581         // when working on the entire strings we don't update nor use the shift
 582         // state from the previous call
 583         statePtr = &stateOrig;
 584     }
 585     else // when working with partial strings we do use the shift state
 586     {
 587         statePtr = wx_const_cast(DecoderState *, &m_stateDecoder);
 588
 589         // also save the old state to be able to rollback to it on error
 590         stateOrig = m_stateDecoder;
 591     }
 592
 593     // but to simplify the code below we use this variable in both cases
 594     DecoderState& state = *statePtr;
 595
 596
 597     // number of characters [which would have been] written to dst [if it were
 598     // not NULL]
 599     size_t len = 0;
 600
 601     const char * const srcEnd = src + srcLen;
 602
 603     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 604     {
 605         const unsigned char cc = *src++;
 606
 607         if ( state.IsShifted() )
 608         {
 609             const unsigned char dc = utf7unb64[cc];
 610             if ( dc == 0xff )
 611             {
 612                 // end of encoded part, check that nothing was left: there can
 613                 // be up to 4 bits of 0 padding but nothing else (we also need
 614                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 615                 // encoded sequence must contain an integral number of UTF-16
 616                 // characters)
 617                 if ( state.isLSB || state.bit > 4 ||
 618                         (state.accum & ((1 << state.bit) - 1)) )
 619                 {
 620                     if ( !len )
 621                         state = stateOrig;
 622
 623                     return wxCONV_FAILED;
 624                 }
 625
 626                 state.ToDirect();
 627
 628                 // re-parse this character normally below unless it's '-' which
 629                 // is consumed by the decoder
 630                 if ( cc == '-' )
 631                     continue;
 632             }
 633             else // valid encoded character
 634             {
 635                 // mini base64 decoder: each character is 6 bits
 636                 state.bit += 6;
 637                 state.accum <<= 6;
 638                 state.accum += dc;
 639
 640                 if ( state.bit >= 8 )
 641                 {
 642                     // got the full byte, consume it
 643                     state.bit -= 8;
 644                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 645
 646                     if ( state.isLSB )
 647                     {
 648                         // we've got the full word, output it
 649                         if ( dst )
 650                             *dst++ = (state.msb << 8) | b;
 651                         len++;
 652                         state.isLSB = false;
 653                     }
 654                     else // MSB
 655                     {
 656                         // just store it while we wait for LSB
 657                         state.msb = b;
 658                         state.isLSB = true;
 659                     }
 660                 }
 661             }
 662         }
 663
 664         if ( state.IsDirect() )
 665         {
 666             // start of an encoded segment?
 667             if ( cc == '+' )
 668             {
 669                 if ( *src == '-' )
 670                 {
 671                     // just the encoded plus sign, don't switch to shifted mode
 672                     if ( dst )
 673                         *dst++ = '+';
 674                     len++;
 675                     src++;
 676                 }
 677                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 678                 {
 679                     // empty encoded chunks are not allowed
 680                     if ( !len )
 681                         state = stateOrig;
 682
 683                     return wxCONV_FAILED;
 684                 }
 685                 else // base-64 encoded chunk follows
 686                 {
 687                     state.ToShifted();
 688                 }
 689             }
 690             else // not '+'
 691             {
 692                 // only printable 7 bit ASCII characters (with the exception of
 693                 // NUL, TAB, CR and LF) can be used directly
 694                 if ( cc >= 0x7f || (cc < ' ' &&
 695                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 696                     return wxCONV_FAILED;
 697
 698                 if ( dst )
 699                     *dst++ = cc;
 700                 len++;
 701             }
 702         }
 703     }
 704
 705     if ( !len )
 706     {
 707         // as we didn't read any characters we should be called with the same
 708         // data (followed by some more new data) again later so don't save our
 709         // state
 710         state = stateOrig;
 711
 712         return wxCONV_FAILED;
 713     }
 714
 715     return len;
 716 }
 717
 718 //
 719 // BASE64 encoding table
 720 //
// indexed by the 6 bit value (0..63): gives the character used to encode it
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 732
 733 //
 734 // UTF-7 encoding table
 735 //
 736 // 0 - Set D (directly encoded characters)
 737 // 1 - Set O (optional direct characters)
 738 // 2 - whitespace characters (optional)
 739 // 3 - special characters
 740 //
 741 static const unsigned char utf7encode[128] =
 742 {
 743     0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 744     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 745     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 746     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 747     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 748     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 749     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 750     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 751 };
 752
 753 static inline bool wxIsUTF7Direct(wchar_t wc)
 754 {
 755     return wc < 0x80 && utf7encode[wc] < 1;
 756 }
 757
// Encodes wide characters in UTF-7.
//
// If srcLen is wxNO_LEN the entire NUL-terminated string, including its
// trailing NUL, is encoded using a local encoder state. Otherwise srcLen
// characters are encoded starting from the state stored in this object by
// the previous call.
//
// If dst is not NULL the result is stored in it, otherwise only the length is
// computed and the stored encoder state is left unchanged.
//
// Returns the number of bytes [which would be] written to dst or, if wchar_t
// is not UTF-16, wxCONV_FAILED for a character above 0xffff.
//
// NOTE(review): dstLen is only checked before starting to process a
// character, so it looks like the output for this character can go beyond the
// end of dst -- confirm that the callers always provide a big enough buffer.
size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
                               const wchar_t *src, size_t srcLen) const
{
    EncoderState stateOrig,
                *statePtr;
    if ( srcLen == wxNO_LEN )
    {
        // we don't apply the stored state when operating on entire strings at
        // once
        statePtr = &stateOrig;

        srcLen = wxWcslen(src) + 1;
    }
    else // do use the mode we left the output in previously
    {
        stateOrig = m_stateEncoder;
        statePtr = wx_const_cast(EncoderState *, &m_stateEncoder);
    }

    EncoderState& state = *statePtr;


    // number of bytes [which would have been] written to dst [if it were not
    // NULL]
    size_t len = 0;

    const wchar_t * const srcEnd = src + srcLen;
    while ( src < srcEnd && (!dst || len < dstLen) )
    {
        wchar_t cc = *src++;
        if ( wxIsUTF7Direct(cc) )
        {
            // this character is output as is but we need to terminate the
            // encoded part first if we're inside one
            if ( state.IsShifted() )
            {
                // pad with zeros the last encoded block if necessary
                if ( state.bit )
                {
                    if ( dst )
                        *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
                    len++;
                }

                state.ToDirect();

                if ( dst )
                    *dst++ = '-';
                len++;
            }

            if ( dst )
                *dst++ = (char)cc;
            len++;
        }
        else if ( cc == '+' && state.IsDirect() )
        {
            // plus sign outside of an encoded part is represented by "+-"
            if ( dst )
            {
                *dst++ = '+';
                *dst++ = '-';
            }

            len += 2;
        }
#ifndef WC_UTF16
        else if (((wxUint32)cc) > 0xffff)
        {
            // no surrogate pair generation (yet?)
            return wxCONV_FAILED;
        }
#endif
        else
        {
            // this character must be encoded, start the encoded part if we're
            // not inside one yet
            if ( state.IsDirect() )
            {
                state.ToShifted();

                if ( dst )
                    *dst++ = '+';
                len++;
            }

            // BASE64 encode string
            for ( ;; )
            {
                // feed the high byte and then the low byte of the character
                // to the accumulator
                for ( unsigned lsb = 0; lsb < 2; lsb++ )
                {
                    state.accum <<= 8;
                    state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;

                    // and output all the complete groups of 6 bits we have
                    for (state.bit += 8; state.bit >= 6; )
                    {
                        state.bit -= 6;
                        if ( dst )
                            *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
                        len++;
                    }
                }

                // continue encoding until the end of input or the next
                // character which can be output directly: it is left in the
                // input to be handled by the outer loop
                //
                // NOTE(review): unlike for the first character, there is no
                // check for cc > 0xffff here, so only the low 16 bits of such
                // characters seem to be encoded -- confirm
                if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
                    break;

                src++;
            }
        }
    }

    // we need to restore the original encoder state if we were called just to
    // calculate the amount of space needed as we will presumably be called
    // again to really convert the data now
    if ( !dst )
        state = stateOrig;

    return len;
}
 870
 871 // ----------------------------------------------------------------------------
 872 // UTF-8
 873 // ----------------------------------------------------------------------------
 874
// NOTE(review): the first 6 values are the biggest values which fit into 7,
// 11, 16, 21, 26 and 31 bits, i.e. presumably element N is the maximal
// character which can be encoded using N+1 bytes in the original (up to 6
// bytes long) UTF-8 scheme -- the code using this table is not in this part
// of the file, confirm before relying on it
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 877
// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range is
// [wxUnicodePUA, wxUnicodePUAEnd) and contains 256 code points
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 882
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte and contains the total number
// of bytes in the sequence (1 to 4) or 0 if this byte can't start a sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 917
// Decode the UTF-8 bytes in src into wide characters.
//
// dst, dstLen: the output buffer and its size in wchar_t; if dstLen is 0
//     nothing is stored and only the length of the result is computed
// src, srcLen: the input and its length in bytes; srcLen may be wxNO_LEN for a
//     NUL-terminated string, the terminating NUL is then converted and counted
//     as well
//
// Returns the number of wchar_t written (or needed) or wxCONV_FAILED if the
// input is not valid or the output buffer is too small.
//
// NOTE(review): continuation bytes are only checked for the 10xxxxxx pattern,
// overlong 3/4 byte forms and encoded surrogates don't seem to be rejected
// here -- confirm whether this is intended.
size_t
wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
                            const char *src, size_t srcLen) const
{
    // no output pointer means that we only count the characters
    wchar_t *out = dstLen ? dst : NULL;
    size_t written = 0;

    // from now on we always work with an explicit length including the
    // trailing NUL, so the wxNO_LEN tests below are not expected to trigger
    if ( srcLen == wxNO_LEN )
        srcLen = strlen(src) + 1;

    for ( const char *p = src; ; p++ )
    {
        if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
        {
            // all done successfully, just add the trailing NULL if we are not
            // using explicit length
            if ( srcLen == wxNO_LEN )
            {
                if ( out )
                {
                    if ( !dstLen )
                        break;

                    *out = L'\0';
                }

                written++;
            }

            return written;
        }

        // reserve the space for (the first unit of) this character
        if ( out && !dstLen-- )
            break;

        wxUint32 code;
        unsigned char c = *p;

        if ( c < 0x80 )
        {
            // ASCII character: it encodes itself
            if ( srcLen == 0 ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen--;

            code = c;
        }
        else
        {
            // the lead byte determines the total length of the sequence
            unsigned len = tableUtf8Lengths[c];
            if ( !len )
                break;

            // the sequence must not be truncated by the end of input
            if ( srcLen < len ) // the test works for wxNO_LEN too
                break;

            if ( srcLen != wxNO_LEN )
                srcLen -= len;

            //   Char. number range   |        UTF-8 octet sequence
            //      (hexadecimal)     |              (binary)
            //  ----------------------+----------------------------------------
            //  0000 0000 - 0000 007F | 0xxxxxxx
            //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
            //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
            //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            //
            //  Code point value is stored in bits marked with 'x',
            //  lowest-order bit of the value on the right side in the diagram
            //  above.                                         (from RFC 3629)

            // mask to extract lead byte's value ('x' bits above), by sequence
            // length:
            static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };

            // mask and value of lead byte's most significant bits, by length:
            static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
            static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };

            len--; // it's more convenient to work with 0-based length here

            // extract the lead byte's value bits:
            if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
                break;

            code = c & leadValueMask[len];

            // all remaining bytes, if any, are handled in the same way
            // regardless of sequence's length:
            for ( ; len; --len )
            {
                // each continuation byte must be of the form 10xxxxxx and
                // contributes 6 more bits to the value
                c = *++p;
                if ( (c & 0xC0) != 0x80 )
                    return wxCONV_FAILED;

                code <<= 6;
                code |= c & 0x3F;
            }
        }

#ifdef WC_UTF16
        // cast is ok because wchar_t == wxUint16 if WC_UTF16
        //
        // a return value of 2 means that a surrogate pair was produced, the
        // extra unit is accounted for here and the other one by the common
        // code below
        //
        // NOTE(review): dstLen was only decremented once for this character
        // above even though two units may be stored here -- confirm that the
        // callers always provide enough space
        if ( encode_utf16(code, (wxUint16 *)out) == 2 )
        {
            if ( out )
                out++;
            written++;
        }
#else // !WC_UTF16
        if ( out )
            *out = code;
#endif // WC_UTF16/!WC_UTF16

        if ( out )
            out++;

        written++;
    }

    // we only get here (by breaking out of the loop) if an error occurred
    return wxCONV_FAILED;
}
1040
1041 size_t
1042 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1043                               const wchar_t *src, size_t srcLen) const
1044 {
1045     char *out = dstLen ? dst : NULL;
1046     size_t written = 0;
1047
1048     for ( const wchar_t *wp = src; ; wp++ )
1049     {
1050         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1051         {
1052             // all done successfully, just add the trailing NULL if we are not
1053             // using explicit length
1054             if ( srcLen == wxNO_LEN )
1055             {
1056                 if ( out )
1057                 {
1058                     if ( !dstLen )
1059                         break;
1060
1061                     *out = '\0';
1062                 }
1063
1064                 written++;
1065             }
1066
1067             return written;
1068         }
1069
1070         if ( srcLen != wxNO_LEN )
1071             srcLen--;
1072
1073         wxUint32 code;
1074 #ifdef WC_UTF16
1075         // cast is ok for WC_UTF16
1076         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1077         {
1078             // skip the next char too as we decoded a surrogate
1079             wp++;
1080         }
1081 #else // wchar_t is UTF-32
1082         code = *wp & 0x7fffffff;
1083 #endif
1084
1085         unsigned len;
1086         if ( code <= 0x7F )
1087         {
1088             len = 1;
1089             if ( out )
1090             {
1091                 if ( dstLen < len )
1092                     break;
1093
1094                 out[0] = (char)code;
1095             }
1096         }
1097         else if ( code <= 0x07FF )
1098         {
1099             len = 2;
1100             if ( out )
1101             {
1102                 if ( dstLen < len )
1103                     break;
1104
1105                 // NB: this line takes 6 least significant bits, encodes them as
1106                 // 10xxxxxx and discards them so that the next byte can be encoded:
1107                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1108                 out[0] = 0xC0 | code;
1109             }
1110         }
1111         else if ( code < 0xFFFF )
1112         {
1113             len = 3;
1114             if ( out )
1115             {
1116                 if ( dstLen < len )
1117                     break;
1118
1119                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1120                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1121                 out[0] = 0xE0 | code;
1122             }
1123         }
1124         else if ( code <= 0x10FFFF )
1125         {
1126             len = 4;
1127             if ( out )
1128             {
1129                 if ( dstLen < len )
1130                     break;
1131
1132                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1133                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1134                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1135                 out[0] = 0xF0 | code;
1136             }
1137         }
1138         else
1139         {
1140             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1141             break;
1142         }
1143
1144         if ( out )
1145         {
1146             out += len;
1147             dstLen -= len;
1148         }
1149
1150         written += len;
1151     }
1152
1153     // we only get here if an error occurs during decoding
1154     return wxCONV_FAILED;
1155 }
1156
// Decode the UTF-8 bytes in psz into wide characters, mapping the invalid
// input in the way selected by m_options.
//
// buf, n: the output buffer and its size in wchar_t; if buf is NULL nothing is
//     stored and only the length is computed
// psz, srcLen: the input and its length in bytes or wxNO_LEN if the input is
//     NUL-terminated
//
// Without any mapping options the work is delegated to the strict base class
// version. Otherwise the bytes of an invalid sequence are either remapped to
// the code points starting at wxUnicodePUA (MAP_INVALID_UTF8_TO_PUA) or
// written as a backslash followed by 3 octal digits (MAP_INVALID_UTF8_TO_OCTAL)
// and, in the latter case, a backslash in the input is doubled.
//
// Returns the number of wchar_t produced plus one or wxCONV_FAILED.
size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
                             const char *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // continue for as long as there is input left and, if we do have an
    // output buffer, space remaining in it
    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        // remember the start of the sequence to be able to go back to it if
        // it turns out to be invalid
        const char *opsz = psz;
        bool invalid = false;
        unsigned char cc = *psz++, fc = cc;

        // count the leading 1 bits of the first byte
        unsigned cnt;
        for (cnt = 0; fc & 0x80; cnt++)
            fc <<= 1;

        if (!cnt)
        {
            // plain ASCII char
            if (buf)
                *buf++ = cc;
            len++;

            // escape the escape character for octal escapes
            if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == '\\' && (!buf || len < n))
            {
                if (buf)
                    *buf++ = cc;
                len++;
            }
        }
        else
        {
            // cnt is now the number of continuation bytes expected
            cnt--;
            if (!cnt)
            {
                // invalid UTF-8 sequence
                invalid = true;
            }
            else
            {
                // index of the maximal value representable with one byte
                // less, used for the overlong encoding check below
                unsigned ocnt = cnt - 1;

                // keep only the value bits of the lead byte
                wxUint32 res = cc & (0x3f >> cnt);
                while (cnt--)
                {
                    cc = *psz;
                    if ((cc & 0xC0) != 0x80)
                    {
                        // invalid UTF-8 sequence
                        invalid = true;
                        break;
                    }

                    psz++;
                    res = (res << 6) | (cc & 0x3f);
                }

                if (invalid || res <= utf8_max[ocnt])
                {
                    // illegal UTF-8 encoding
                    invalid = true;
                }
                else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
                        res >= wxUnicodePUA && res < wxUnicodePUAEnd)
                {
                    // if one of our PUA characters turns up externally
                    // it must also be treated as an illegal sequence
                    // (a bit like you have to escape an escape character)
                    invalid = true;
                }
                else
                {
#ifdef WC_UTF16
                    // cast is ok because wchar_t == wxUint16 if WC_UTF16
                    size_t pa = encode_utf16(res, (wxUint16 *)buf);
                    if (pa == wxCONV_FAILED)
                    {
                        invalid = true;
                    }
                    else
                    {
                        if (buf)
                            buf += pa;
                        len += pa;
                    }
#else // !WC_UTF16
                    if (buf)
                        *buf++ = (wchar_t)res;
                    len++;
#endif // WC_UTF16/!WC_UTF16
                }
            }

            if (invalid)
            {
                // map all the bytes consumed so far, i.e. those in the range
                // [opsz, psz), individually
                if (m_options & MAP_INVALID_UTF8_TO_PUA)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
#ifdef WC_UTF16
                        // cast is ok because wchar_t == wxUint16 if WC_UTF16
                        size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
                        wxASSERT(pa != wxCONV_FAILED);
                        if (buf)
                            buf += pa;
                        opsz++;
                        len += pa;
#else
                        if (buf)
                            *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
                        opsz++;
                        len++;
#endif
                    }
                }
                else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                {
                    while (opsz < psz && (!buf || len < n))
                    {
                        // the escape is only stored if all 4 of its
                        // characters fit but is always counted in len
                        if ( buf && len + 3 < n )
                        {
                            unsigned char on = *opsz;
                            *buf++ = L'\\';
                            *buf++ = (wchar_t)( L'0' + on / 0100 );
                            *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
                            *buf++ = (wchar_t)( L'0' + on % 010 );
                        }

                        opsz++;
                        len += 4;
                    }
                }
                else // MAP_INVALID_UTF8_NOT
                {
                    return wxCONV_FAILED;
                }
            }
        }
    }

    // NUL-terminate the output if the input was NUL-terminated and there is
    // still space for it
    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1304
// Check if the given wide character is an octal digit, i.e. is in the range
// from L'0' to L'7' inclusive.
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1309
// Encode the wide character string psz as UTF-8, undoing the mapping of the
// invalid bytes selected by m_options.
//
// buf, n: the output buffer and its size in bytes; if buf is NULL nothing is
//     stored and only the length is computed
// psz, srcLen: the input and its length in wchar_t or wxNO_LEN if the input is
//     NUL-terminated
//
// Without any mapping options the work is delegated to the strict base class
// version. Otherwise code points in [wxUnicodePUA, wxUnicodePUAEnd) are turned
// back into single bytes (MAP_INVALID_UTF8_TO_PUA) or a doubled backslash and
// a backslash followed by 3 octal digits are turned back into a backslash and
// the byte with this octal value respectively (MAP_INVALID_UTF8_TO_OCTAL).
//
// Returns the number of bytes produced plus one.
size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
                               const wchar_t *psz, size_t srcLen) const
{
    if ( m_options == MAP_INVALID_UTF8_NOT )
        return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);

    size_t len = 0;

    // continue for as long as there is input left and, if we do have an
    // output buffer, space remaining in it
    while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
    {
        wxUint32 cc;

#ifdef WC_UTF16
        // cast is ok for WC_UTF16
        //
        // advance by a single unit if decoding failed
        size_t pa = decode_utf16((const wxUint16 *)psz, cc);
        psz += (pa == wxCONV_FAILED) ? 1 : pa;
#else
        cc = (*psz++) & 0x7fffffff;
#endif

        if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
                && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
        {
            // this is a remapped invalid byte, restore its original value
            if (buf)
                *buf++ = (char)(cc - wxUnicodePUA);
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
                    && cc == L'\\' && psz[0] == L'\\' )
        {
            // an escaped backslash: output one backslash for the two
            if (buf)
                *buf++ = (char)cc;
            psz++;
            len++;
        }
        else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
                    cc == L'\\' &&
                        isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
        {
            // an octal escape: output the byte with the value given by the 3
            // digits following the backslash
            if (buf)
            {
                *buf++ = (char) ((psz[0] - L'0') * 0100 +
                                 (psz[1] - L'0') * 010 +
                                 (psz[2] - L'0'));
            }

            psz += 3;
            len++;
        }
        else
        {
            // find the number of continuation bytes needed for this value
            unsigned cnt;
            for (cnt = 0; cc > utf8_max[cnt]; cnt++)
            {
            }

            if (!cnt)
            {
                // plain ASCII char
                if (buf)
                    *buf++ = (char) cc;
                len++;
            }
            else
            {
                len += cnt + 1;
                if (buf)
                {
                    // the lead byte starts with cnt + 1 bits set followed by
                    // a 0 bit and the most significant bits of the value
                    //
                    // NOTE(review): this relies on right shift of a negative
                    // value being arithmetic
                    *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));

                    // and each of the continuation bytes is 10xxxxxx with the
                    // next 6 bits of the value
                    while (cnt--)
                        *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
                }
            }
        }
    }

    // NUL-terminate the output if the input was NUL-terminated and there is
    // still space for it
    if (srcLen == wxNO_LEN && buf && (len < n))
        *buf = 0;

    return len + 1;
}
1391
1392 // ============================================================================
1393 // UTF-16
1394 // ============================================================================
1395
// the code below is written in terms of "straight" and "swap" converters, map
// these names to the BE and LE classes: the "straight" one is the BE class if
// WORDS_BIGENDIAN is defined and the LE one otherwise
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
1403
1404 /* static */
1405 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1406 {
1407     if ( srcLen == wxNO_LEN )
1408     {
1409         // count the number of bytes in input, including the trailing NULs
1410         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1411         for ( srcLen = 1; *inBuff++; srcLen++ )
1412             ;
1413
1414         srcLen *= BYTES_PER_CHAR;
1415     }
1416     else // we already have the length
1417     {
1418         // we can only convert an entire number of UTF-16 characters
1419         if ( srcLen % BYTES_PER_CHAR )
1420             return wxCONV_FAILED;
1421     }
1422
1423     return srcLen;
1424 }
1425
1426 // case when in-memory representation is UTF-16 too
1427 #ifdef WC_UTF16
1428
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1432
1433 size_t
1434 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1435                                const char *src, size_t srcLen) const
1436 {
1437     // set up the scene for using memcpy() (which is presumably more efficient
1438     // than copying the bytes one by one)
1439     srcLen = GetLength(src, srcLen);
1440     if ( srcLen == wxNO_LEN )
1441         return wxCONV_FAILED;
1442
1443     const size_t inLen = srcLen / BYTES_PER_CHAR;
1444     if ( dst )
1445     {
1446         if ( dstLen < inLen )
1447             return wxCONV_FAILED;
1448
1449         memcpy(dst, src, srcLen);
1450     }
1451
1452     return inLen;
1453 }
1454
1455 size_t
1456 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1457                                  const wchar_t *src, size_t srcLen) const
1458 {
1459     if ( srcLen == wxNO_LEN )
1460         srcLen = wxWcslen(src) + 1;
1461
1462     srcLen *= BYTES_PER_CHAR;
1463
1464     if ( dst )
1465     {
1466         if ( dstLen < srcLen )
1467             return wxCONV_FAILED;
1468
1469         memcpy(dst, src, srcLen);
1470     }
1471
1472     return srcLen;
1473 }
1474
1475 // ----------------------------------------------------------------------------
1476 // endian-reversing conversions
1477 // ----------------------------------------------------------------------------
1478
1479 size_t
1480 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1481                            const char *src, size_t srcLen) const
1482 {
1483     srcLen = GetLength(src, srcLen);
1484     if ( srcLen == wxNO_LEN )
1485         return wxCONV_FAILED;
1486
1487     srcLen /= BYTES_PER_CHAR;
1488
1489     if ( dst )
1490     {
1491         if ( dstLen < srcLen )
1492             return wxCONV_FAILED;
1493
1494         const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1495         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1496         {
1497             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1498         }
1499     }
1500
1501     return srcLen;
1502 }
1503
1504 size_t
1505 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1506                              const wchar_t *src, size_t srcLen) const
1507 {
1508     if ( srcLen == wxNO_LEN )
1509         srcLen = wxWcslen(src) + 1;
1510
1511     srcLen *= BYTES_PER_CHAR;
1512
1513     if ( dst )
1514     {
1515         if ( dstLen < srcLen )
1516             return wxCONV_FAILED;
1517
1518         wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1519         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1520         {
1521             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1522         }
1523     }
1524
1525     return srcLen;
1526 }
1527
1528 #else // !WC_UTF16: wchar_t is UTF-32
1529
1530 // ----------------------------------------------------------------------------
1531 // conversions without endianness change
1532 // ----------------------------------------------------------------------------
1533
1534 size_t
1535 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1536                                const char *src, size_t srcLen) const
1537 {
1538     srcLen = GetLength(src, srcLen);
1539     if ( srcLen == wxNO_LEN )
1540         return wxCONV_FAILED;
1541
1542     const size_t inLen = srcLen / BYTES_PER_CHAR;
1543     if ( !dst )
1544     {
1545         // optimization: return maximal space which could be needed for this
1546         // string even if the real size could be smaller if the buffer contains
1547         // any surrogates
1548         return inLen;
1549     }
1550
1551     size_t outLen = 0;
1552     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1553     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1554     {
1555         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1556         if ( !inBuff )
1557             return wxCONV_FAILED;
1558
1559         if ( ++outLen > dstLen )
1560             return wxCONV_FAILED;
1561
1562         *dst++ = ch;
1563     }
1564
1565
1566     return outLen;
1567 }
1568
1569 size_t
1570 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1571                                  const wchar_t *src, size_t srcLen) const
1572 {
1573     if ( srcLen == wxNO_LEN )
1574         srcLen = wxWcslen(src) + 1;
1575
1576     size_t outLen = 0;
1577     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1578     for ( size_t n = 0; n < srcLen; n++ )
1579     {
1580         wxUint16 cc[2];
1581         const size_t numChars = encode_utf16(*src++, cc);
1582         if ( numChars == wxCONV_FAILED )
1583             return wxCONV_FAILED;
1584
1585         outLen += numChars * BYTES_PER_CHAR;
1586         if ( outBuff )
1587         {
1588             if ( outLen > dstLen )
1589                 return wxCONV_FAILED;
1590
1591             *outBuff++ = cc[0];
1592             if ( numChars == 2 )
1593             {
1594                 // second character of a surrogate
1595                 *outBuff++ = cc[1];
1596             }
1597         }
1598     }
1599
1600     return outLen;
1601 }
1602
1603 // ----------------------------------------------------------------------------
1604 // endian-reversing conversions
1605 // ----------------------------------------------------------------------------
1606
1607 size_t
1608 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1609                            const char *src, size_t srcLen) const
1610 {
1611     srcLen = GetLength(src, srcLen);
1612     if ( srcLen == wxNO_LEN )
1613         return wxCONV_FAILED;
1614
1615     const size_t inLen = srcLen / BYTES_PER_CHAR;
1616     if ( !dst )
1617     {
1618         // optimization: return maximal space which could be needed for this
1619         // string even if the real size could be smaller if the buffer contains
1620         // any surrogates
1621         return inLen;
1622     }
1623
1624     size_t outLen = 0;
1625     const wxUint16 *inBuff = wx_reinterpret_cast(const wxUint16 *, src);
1626     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1627     {
1628         wxUint32 ch;
1629         wxUint16 tmp[2];
1630
1631         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1632         inBuff++;
1633         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1634
1635         const size_t numChars = decode_utf16(tmp, ch);
1636         if ( numChars == wxCONV_FAILED )
1637             return wxCONV_FAILED;
1638
1639         if ( numChars == 2 )
1640             inBuff++;
1641
1642         if ( ++outLen > dstLen )
1643             return wxCONV_FAILED;
1644
1645         *dst++ = ch;
1646     }
1647
1648
1649     return outLen;
1650 }
1651
1652 size_t
1653 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1654                              const wchar_t *src, size_t srcLen) const
1655 {
1656     if ( srcLen == wxNO_LEN )
1657         srcLen = wxWcslen(src) + 1;
1658
1659     size_t outLen = 0;
1660     wxUint16 *outBuff = wx_reinterpret_cast(wxUint16 *, dst);
1661     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1662     {
1663         wxUint16 cc[2];
1664         const size_t numChars = encode_utf16(*src, cc);
1665         if ( numChars == wxCONV_FAILED )
1666             return wxCONV_FAILED;
1667
1668         outLen += numChars * BYTES_PER_CHAR;
1669         if ( outBuff )
1670         {
1671             if ( outLen > dstLen )
1672                 return wxCONV_FAILED;
1673
1674             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1675             if ( numChars == 2 )
1676             {
1677                 // second character of a surrogate
1678                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1679             }
1680         }
1681     }
1682
1683     return outLen;
1684 }
1685
1686 #endif // WC_UTF16/!WC_UTF16
1687
1688
1689 // ============================================================================
1690 // UTF-32
1691 // ============================================================================
1692
// the code below is written in terms of "straight" and "swap" converters, map
// these names to the BE and LE classes: the "straight" one is the BE class if
// WORDS_BIGENDIAN is defined and the LE one otherwise
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif


// the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1704
1705 /* static */
1706 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1707 {
1708     if ( srcLen == wxNO_LEN )
1709     {
1710         // count the number of bytes in input, including the trailing NULs
1711         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1712         for ( srcLen = 1; *inBuff++; srcLen++ )
1713             ;
1714
1715         srcLen *= BYTES_PER_CHAR;
1716     }
1717     else // we already have the length
1718     {
1719         // we can only convert an entire number of UTF-32 characters
1720         if ( srcLen % BYTES_PER_CHAR )
1721             return wxCONV_FAILED;
1722     }
1723
1724     return srcLen;
1725 }
1726
1727 // case when in-memory representation is UTF-16
1728 #ifdef WC_UTF16
1729
1730 // ----------------------------------------------------------------------------
1731 // conversions without endianness change
1732 // ----------------------------------------------------------------------------
1733
1734 size_t
1735 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1736                                const char *src, size_t srcLen) const
1737 {
1738     srcLen = GetLength(src, srcLen);
1739     if ( srcLen == wxNO_LEN )
1740         return wxCONV_FAILED;
1741
1742     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1743     const size_t inLen = srcLen / BYTES_PER_CHAR;
1744     size_t outLen = 0;
1745     for ( size_t n = 0; n < inLen; n++ )
1746     {
1747         wxUint16 cc[2];
1748         const size_t numChars = encode_utf16(*inBuff++, cc);
1749         if ( numChars == wxCONV_FAILED )
1750             return wxCONV_FAILED;
1751
1752         outLen += numChars;
1753         if ( dst )
1754         {
1755             if ( outLen > dstLen )
1756                 return wxCONV_FAILED;
1757
1758             *dst++ = cc[0];
1759             if ( numChars == 2 )
1760             {
1761                 // second character of a surrogate
1762                 *dst++ = cc[1];
1763             }
1764         }
1765     }
1766
1767     return outLen;
1768 }
1769
1770 size_t
1771 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1772                                  const wchar_t *src, size_t srcLen) const
1773 {
1774     if ( srcLen == wxNO_LEN )
1775         srcLen = wxWcslen(src) + 1;
1776
1777     if ( !dst )
1778     {
1779         // optimization: return maximal space which could be needed for this
1780         // string instead of the exact amount which could be less if there are
1781         // any surrogates in the input
1782         //
1783         // we consider that surrogates are rare enough to make it worthwhile to
1784         // avoid running the loop below at the cost of slightly extra memory
1785         // consumption
1786         return srcLen * BYTES_PER_CHAR;
1787     }
1788
1789     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1790     size_t outLen = 0;
1791     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1792     {
1793         const wxUint32 ch = wxDecodeSurrogate(&src);
1794         if ( !src )
1795             return wxCONV_FAILED;
1796
1797         outLen += BYTES_PER_CHAR;
1798
1799         if ( outLen > dstLen )
1800             return wxCONV_FAILED;
1801
1802         *outBuff++ = ch;
1803     }
1804
1805     return outLen;
1806 }
1807
1808 // ----------------------------------------------------------------------------
1809 // endian-reversing conversions
1810 // ----------------------------------------------------------------------------
1811
1812 size_t
1813 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1814                            const char *src, size_t srcLen) const
1815 {
1816     srcLen = GetLength(src, srcLen);
1817     if ( srcLen == wxNO_LEN )
1818         return wxCONV_FAILED;
1819
1820     const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1821     const size_t inLen = srcLen / BYTES_PER_CHAR;
1822     size_t outLen = 0;
1823     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1824     {
1825         wxUint16 cc[2];
1826         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1827         if ( numChars == wxCONV_FAILED )
1828             return wxCONV_FAILED;
1829
1830         outLen += numChars;
1831         if ( dst )
1832         {
1833             if ( outLen > dstLen )
1834                 return wxCONV_FAILED;
1835
1836             *dst++ = cc[0];
1837             if ( numChars == 2 )
1838             {
1839                 // second character of a surrogate
1840                 *dst++ = cc[1];
1841             }
1842         }
1843     }
1844
1845     return outLen;
1846 }
1847
1848 size_t
1849 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1850                              const wchar_t *src, size_t srcLen) const
1851 {
1852     if ( srcLen == wxNO_LEN )
1853         srcLen = wxWcslen(src) + 1;
1854
1855     if ( !dst )
1856     {
1857         // optimization: return maximal space which could be needed for this
1858         // string instead of the exact amount which could be less if there are
1859         // any surrogates in the input
1860         //
1861         // we consider that surrogates are rare enough to make it worthwhile to
1862         // avoid running the loop below at the cost of slightly extra memory
1863         // consumption
1864         return srcLen*BYTES_PER_CHAR;
1865     }
1866
1867     wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1868     size_t outLen = 0;
1869     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870     {
1871         const wxUint32 ch = wxDecodeSurrogate(&src);
1872         if ( !src )
1873             return wxCONV_FAILED;
1874
1875         outLen += BYTES_PER_CHAR;
1876
1877         if ( outLen > dstLen )
1878             return wxCONV_FAILED;
1879
1880         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1881     }
1882
1883     return outLen;
1884 }
1885
1886 #else // !WC_UTF16: wchar_t is UTF-32
1887
1888 // ----------------------------------------------------------------------------
1889 // conversions without endianness change
1890 // ----------------------------------------------------------------------------
1891
1892 size_t
1893 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1894                                const char *src, size_t srcLen) const
1895 {
1896     // use memcpy() as it should be much faster than hand-written loop
1897     srcLen = GetLength(src, srcLen);
1898     if ( srcLen == wxNO_LEN )
1899         return wxCONV_FAILED;
1900
1901     const size_t inLen = srcLen/BYTES_PER_CHAR;
1902     if ( dst )
1903     {
1904         if ( dstLen < inLen )
1905             return wxCONV_FAILED;
1906
1907         memcpy(dst, src, srcLen);
1908     }
1909
1910     return inLen;
1911 }
1912
1913 size_t
1914 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1915                                  const wchar_t *src, size_t srcLen) const
1916 {
1917     if ( srcLen == wxNO_LEN )
1918         srcLen = wxWcslen(src) + 1;
1919
1920     srcLen *= BYTES_PER_CHAR;
1921
1922     if ( dst )
1923     {
1924         if ( dstLen < srcLen )
1925             return wxCONV_FAILED;
1926
1927         memcpy(dst, src, srcLen);
1928     }
1929
1930     return srcLen;
1931 }
1932
1933 // ----------------------------------------------------------------------------
1934 // endian-reversing conversions
1935 // ----------------------------------------------------------------------------
1936
1937 size_t
1938 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1939                            const char *src, size_t srcLen) const
1940 {
1941     srcLen = GetLength(src, srcLen);
1942     if ( srcLen == wxNO_LEN )
1943         return wxCONV_FAILED;
1944
1945     srcLen /= BYTES_PER_CHAR;
1946
1947     if ( dst )
1948     {
1949         if ( dstLen < srcLen )
1950             return wxCONV_FAILED;
1951
1952         const wxUint32 *inBuff = wx_reinterpret_cast(const wxUint32 *, src);
1953         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1954         {
1955             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1956         }
1957     }
1958
1959     return srcLen;
1960 }
1961
1962 size_t
1963 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1964                              const wchar_t *src, size_t srcLen) const
1965 {
1966     if ( srcLen == wxNO_LEN )
1967         srcLen = wxWcslen(src) + 1;
1968
1969     srcLen *= BYTES_PER_CHAR;
1970
1971     if ( dst )
1972     {
1973         if ( dstLen < srcLen )
1974             return wxCONV_FAILED;
1975
1976         wxUint32 *outBuff = wx_reinterpret_cast(wxUint32 *, dst);
1977         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1978         {
1979             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1980         }
1981     }
1982
1983     return srcLen;
1984 }
1985
1986 #endif // WC_UTF16/!WC_UTF16
1987
1988
1989 // ============================================================================
1990 // The classes doing conversion using the iconv_xxx() functions
1991 // ============================================================================
1992
1993 #ifdef HAVE_ICONV
1994
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres, with
// bufLeft bytes remaining in its input buffer, should be considered as failed.
//
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if the output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when a _real_ error occurs, the number of
//     bytes left in the input buffer is non-zero. Hence the alternative test
//     for iconv() failure used for the affected versions.
//     [This bug does not appear in glibc 2.2.]
//
// Both macro arguments are parenthesized so that arbitrary expressions can be
// passed without operator precedence surprises.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif
2009
// cast the address of an input pointer to the pointer-to-pointer type used
// for the input buffer argument of iconv() calls in this file (ICONV_CONST is
// presumably supplied by the build configuration -- TODO confirm)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// value compared with the handles obtained from iconv_open() to detect that
// opening the conversion failed
#define ICONV_T_INVALID ((iconv_t)-1)

// WC_BSWAP swaps the bytes of a single wchar_t and WC_ENC is the font
// encoding corresponding to wchar_t, both chosen according to its size
#if SIZEOF_WCHAR_T == 4
    #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF32
#elif SIZEOF_WCHAR_T == 2
    #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
    #define WC_ENC      wxFONTENCODING_UTF16
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
2024
2025 // ----------------------------------------------------------------------------
2026 // wxMBConv_iconv: encapsulates an iconv character set
2027 // ----------------------------------------------------------------------------
2028
2029 class wxMBConv_iconv : public wxMBConv
2030 {
2031 public:
2032     wxMBConv_iconv(const char *name);
2033     virtual ~wxMBConv_iconv();
2034
2035     // implement base class virtual methods
2036     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2037                            const char *src, size_t srcLen = wxNO_LEN) const;
2038     virtual size_t FromWChar(char *dst, size_t dstLen,
2039                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2040     virtual size_t GetMBNulLen() const;
2041
2042 #if wxUSE_UNICODE_UTF8
2043     virtual bool IsUTF8() const;
2044 #endif
2045
2046     virtual wxMBConv *Clone() const
2047     {
2048         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2049         p->m_minMBCharWidth = m_minMBCharWidth;
2050         return p;
2051     }
2052
2053     bool IsOk() const
2054         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2055
2056 protected:
2057     // the iconv handlers used to translate from multibyte
2058     // to wide char and in the other direction
2059     iconv_t m2w,
2060             w2m;
2061
2062 #if wxUSE_THREADS
2063     // guards access to m2w and w2m objects
2064     wxMutex m_iconvMutex;
2065 #endif
2066
2067 private:
2068     // the name (for iconv_open()) of a wide char charset -- if none is
2069     // available on this machine, it will remain NULL
2070     static wxString ms_wcCharsetName;
2071
2072     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073     // different endian-ness than the native one
2074     static bool ms_wcNeedsSwap;
2075
2076
2077     // name of the encoding handled by this conversion
2078     wxString m_name;
2079
2080     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2081     // initially
2082     size_t m_minMBCharWidth;
2083 };
2084
2085 // make the constructor available for unit testing
2086 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2087 {
2088     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2089     if ( !result->IsOk() )
2090     {
2091         delete result;
2092         return 0;
2093     }
2094
2095     return result;
2096 }
2097
2098 wxString wxMBConv_iconv::ms_wcCharsetName;
2099 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2100
2101 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2102               : m_name(name)
2103 {
2104     m_minMBCharWidth = 0;
2105
2106     // check for charset that represents wchar_t:
2107     if ( ms_wcCharsetName.empty() )
2108     {
2109         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2110
2111 #if wxUSE_FONTMAP
2112         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2113 #else // !wxUSE_FONTMAP
2114         static const wxChar *names_static[] =
2115         {
2116 #if SIZEOF_WCHAR_T == 4
2117             _T("UCS-4"),
2118 #elif SIZEOF_WCHAR_T = 2
2119             _T("UCS-2"),
2120 #endif
2121             NULL
2122         };
2123         const wxChar **names = names_static;
2124 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2125
2126         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2127         {
2128             const wxString nameCS(*names);
2129
2130             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2131             wxString nameXE(nameCS);
2132
2133 #ifdef WORDS_BIGENDIAN
2134                 nameXE += _T("BE");
2135 #else // little endian
2136                 nameXE += _T("LE");
2137 #endif
2138
2139             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2140                        nameXE.c_str());
2141
2142             m2w = iconv_open(nameXE.ToAscii(), name);
2143             if ( m2w == ICONV_T_INVALID )
2144             {
2145                 // try charset w/o bytesex info (e.g. "UCS4")
2146                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2147                            nameCS.c_str());
2148                 m2w = iconv_open(nameCS.ToAscii(), name);
2149
2150                 // and check for bytesex ourselves:
2151                 if ( m2w != ICONV_T_INVALID )
2152                 {
2153                     char    buf[2], *bufPtr;
2154                     wchar_t wbuf[2];
2155                     size_t  insz, outsz;
2156                     size_t  res;
2157
2158                     buf[0] = 'A';
2159                     buf[1] = 0;
2160                     wbuf[0] = 0;
2161                     insz = 2;
2162                     outsz = SIZEOF_WCHAR_T * 2;
2163                     char* wbufPtr = (char*)wbuf;
2164                     bufPtr = buf;
2165
2166                     res = iconv(
2167                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2168                         &wbufPtr, &outsz);
2169
2170                     if (ICONV_FAILED(res, insz))
2171                     {
2172                         wxLogLastError(wxT("iconv"));
2173                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2174                                    nameCS.c_str());
2175                     }
2176                     else // ok, can convert to this encoding, remember it
2177                     {
2178                         ms_wcCharsetName = nameCS;
2179                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2180                     }
2181                 }
2182             }
2183             else // use charset not requiring byte swapping
2184             {
2185                 ms_wcCharsetName = nameXE;
2186             }
2187         }
2188
2189         wxLogTrace(TRACE_STRCONV,
2190                    wxT("iconv wchar_t charset is \"%s\"%s"),
2191                    ms_wcCharsetName.empty() ? wxString("<none>")
2192                                             : ms_wcCharsetName,
2193                    ms_wcNeedsSwap ? _T(" (needs swap)")
2194                                   : _T(""));
2195     }
2196     else // we already have ms_wcCharsetName
2197     {
2198         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2199     }
2200
2201     if ( ms_wcCharsetName.empty() )
2202     {
2203         w2m = ICONV_T_INVALID;
2204     }
2205     else
2206     {
2207         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2208         if ( w2m == ICONV_T_INVALID )
2209         {
2210             wxLogTrace(TRACE_STRCONV,
2211                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2212                        ms_wcCharsetName.c_str(), name);
2213         }
2214     }
2215 }
2216
2217 wxMBConv_iconv::~wxMBConv_iconv()
2218 {
2219     if ( m2w != ICONV_T_INVALID )
2220         iconv_close(m2w);
2221     if ( w2m != ICONV_T_INVALID )
2222         iconv_close(w2m);
2223 }
2224
2225 size_t
2226 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2227                         const char *src, size_t srcLen) const
2228 {
2229     if ( srcLen == wxNO_LEN )
2230     {
2231         // find the string length: notice that must be done differently for
2232         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2233         // consecutive NULs
2234         const size_t nulLen = GetMBNulLen();
2235         switch ( nulLen )
2236         {
2237             default:
2238                 return wxCONV_FAILED;
2239
2240             case 1:
2241                 srcLen = strlen(src); // arguably more optimized than our version
2242                 break;
2243
2244             case 2:
2245             case 4:
2246                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2247                 // but they also have to start at character boundary and not
2248                 // span two adjacent characters
2249                 const char *p;
2250                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2251                     ;
2252                 srcLen = p - src;
2253                 break;
2254         }
2255
2256         // when we're determining the length of the string ourselves we count
2257         // the terminating NUL(s) as part of it and always NUL-terminate the
2258         // output
2259         srcLen += nulLen;
2260     }
2261
2262     // we express length in the number of (wide) characters but iconv always
2263     // counts buffer sizes it in bytes
2264     dstLen *= SIZEOF_WCHAR_T;
2265
2266 #if wxUSE_THREADS
2267     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268     //     Unfortunately there are a couple of global wxCSConv objects such as
2269     //     wxConvLocal that are used all over wx code, so we have to make sure
2270     //     the handle is used by at most one thread at the time. Otherwise
2271     //     only a few wx classes would be safe to use from non-main threads
2272     //     as MB<->WC conversion would fail "randomly".
2273     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2274 #endif // wxUSE_THREADS
2275
2276     size_t res, cres;
2277     const char *pszPtr = src;
2278
2279     if ( dst )
2280     {
2281         char* bufPtr = (char*)dst;
2282
2283         // have destination buffer, convert there
2284         size_t dstLenOrig = dstLen;
2285         cres = iconv(m2w,
2286                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2287                      &bufPtr, &dstLen);
2288
2289         // convert the number of bytes converted as returned by iconv to the
2290         // number of (wide) characters converted that we need
2291         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2292
2293         if (ms_wcNeedsSwap)
2294         {
2295             // convert to native endianness
2296             for ( unsigned i = 0; i < res; i++ )
2297                 dst[i] = WC_BSWAP(dst[i]);
2298         }
2299     }
2300     else // no destination buffer
2301     {
2302         // convert using temp buffer to calculate the size of the buffer needed
2303         wchar_t tbuf[8];
2304         res = 0;
2305
2306         do
2307         {
2308             char* bufPtr = (char*)tbuf;
2309             dstLen = 8 * SIZEOF_WCHAR_T;
2310
2311             cres = iconv(m2w,
2312                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2313                          &bufPtr, &dstLen );
2314
2315             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2316         }
2317         while ((cres == (size_t)-1) && (errno == E2BIG));
2318     }
2319
2320     if (ICONV_FAILED(cres, srcLen))
2321     {
2322         //VS: it is ok if iconv fails, hence trace only
2323         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2324         return wxCONV_FAILED;
2325     }
2326
2327     return res;
2328 }
2329
2330 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2331                                  const wchar_t *src, size_t srcLen) const
2332 {
2333 #if wxUSE_THREADS
2334     // NB: explained in MB2WC
2335     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2336 #endif
2337
2338     if ( srcLen == wxNO_LEN )
2339         srcLen = wxWcslen(src) + 1;
2340
2341     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2342     size_t outbuflen = dstLen;
2343     size_t res, cres;
2344
2345     wchar_t *tmpbuf = 0;
2346
2347     if (ms_wcNeedsSwap)
2348     {
2349         // need to copy to temp buffer to switch endianness
2350         // (doing WC_BSWAP twice on the original buffer won't help, as it
2351         //  could be in read-only memory, or be accessed in some other thread)
2352         tmpbuf = (wchar_t *)malloc(inbuflen + SIZEOF_WCHAR_T);
2353         for ( size_t i = 0; i < srcLen; i++ )
2354             tmpbuf[i] = WC_BSWAP(src[i]);
2355
2356         tmpbuf[srcLen] = L'\0';
2357         src = tmpbuf;
2358     }
2359
2360     char* inbuf = (char*)src;
2361     if ( dst )
2362     {
2363         // have destination buffer, convert there
2364         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2365
2366         res = dstLen - outbuflen;
2367     }
2368     else // no destination buffer
2369     {
2370         // convert using temp buffer to calculate the size of the buffer needed
2371         char tbuf[16];
2372         res = 0;
2373         do
2374         {
2375             dst = tbuf;
2376             outbuflen = 16;
2377
2378             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2379
2380             res += 16 - outbuflen;
2381         }
2382         while ((cres == (size_t)-1) && (errno == E2BIG));
2383     }
2384
2385     if (ms_wcNeedsSwap)
2386     {
2387         free(tmpbuf);
2388     }
2389
2390     if (ICONV_FAILED(cres, inbuflen))
2391     {
2392         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2393         return wxCONV_FAILED;
2394     }
2395
2396     return res;
2397 }
2398
2399 size_t wxMBConv_iconv::GetMBNulLen() const
2400 {
2401     if ( m_minMBCharWidth == 0 )
2402     {
2403         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2404
2405 #if wxUSE_THREADS
2406         // NB: explained in MB2WC
2407         wxMutexLocker lock(self->m_iconvMutex);
2408 #endif
2409
2410         const wchar_t *wnul = L"";
2411         char buf[8]; // should be enough for NUL in any encoding
2412         size_t inLen = sizeof(wchar_t),
2413                outLen = WXSIZEOF(buf);
2414         char *inBuff = (char *)wnul;
2415         char *outBuff = buf;
2416         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2417         {
2418             self->m_minMBCharWidth = (size_t)-1;
2419         }
2420         else // ok
2421         {
2422             self->m_minMBCharWidth = outBuff - buf;
2423         }
2424     }
2425
2426     return m_minMBCharWidth;
2427 }
2428
2429 #if wxUSE_UNICODE_UTF8
2430 bool wxMBConv_iconv::IsUTF8() const
2431 {
2432     return wxStricmp(m_name, "UTF-8") == 0 ||
2433            wxStricmp(m_name, "UTF8") == 0;
2434 }
2435 #endif
2436
2437 #endif // HAVE_ICONV
2438
2439
2440 // ============================================================================
2441 // Win32 conversion classes
2442 // ============================================================================
2443
2444 #ifdef wxHAVE_WIN32_MB2WC
2445
2446 // from utils.cpp
2447 #if wxUSE_FONTMAP
2448 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2449 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2450 #endif
2451
2452 class wxMBConv_win32 : public wxMBConv
2453 {
2454 public:
2455     wxMBConv_win32()
2456     {
2457         m_CodePage = CP_ACP;
2458         m_minMBCharWidth = 0;
2459     }
2460
2461     wxMBConv_win32(const wxMBConv_win32& conv)
2462         : wxMBConv()
2463     {
2464         m_CodePage = conv.m_CodePage;
2465         m_minMBCharWidth = conv.m_minMBCharWidth;
2466     }
2467
2468 #if wxUSE_FONTMAP
2469     wxMBConv_win32(const char* name)
2470     {
2471         m_CodePage = wxCharsetToCodepage(name);
2472         m_minMBCharWidth = 0;
2473     }
2474
2475     wxMBConv_win32(wxFontEncoding encoding)
2476     {
2477         m_CodePage = wxEncodingToCodepage(encoding);
2478         m_minMBCharWidth = 0;
2479     }
2480 #endif // wxUSE_FONTMAP
2481
2482     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2483     {
2484         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2485         // the behaviour is not compatible with the Unix version (using iconv)
2486         // and break the library itself, e.g. wxTextInputStream::NextChar()
2487         // wouldn't work if reading an incomplete MB char didn't result in an
2488         // error
2489         //
2490         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2491         // Win XP or newer and it is not supported for UTF-[78] so we always
2492         // use our own conversions in this case. See
2493         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2494         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2495         if ( m_CodePage == CP_UTF8 )
2496         {
2497             return wxMBConvUTF8().MB2WC(buf, psz, n);
2498         }
2499
2500         if ( m_CodePage == CP_UTF7 )
2501         {
2502             return wxMBConvUTF7().MB2WC(buf, psz, n);
2503         }
2504
2505         int flags = 0;
2506         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2507                 IsAtLeastWin2kSP4() )
2508         {
2509             flags = MB_ERR_INVALID_CHARS;
2510         }
2511
2512         const size_t len = ::MultiByteToWideChar
2513                              (
2514                                 m_CodePage,     // code page
2515                                 flags,          // flags: fall on error
2516                                 psz,            // input string
2517                                 -1,             // its length (NUL-terminated)
2518                                 buf,            // output string
2519                                 buf ? n : 0     // size of output buffer
2520                              );
2521         if ( !len )
2522         {
2523             // function totally failed
2524             return wxCONV_FAILED;
2525         }
2526
2527         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2528         // check if we succeeded, by doing a double trip:
2529         if ( !flags && buf )
2530         {
2531             const size_t mbLen = strlen(psz);
2532             wxCharBuffer mbBuf(mbLen);
2533             if ( ::WideCharToMultiByte
2534                    (
2535                       m_CodePage,
2536                       0,
2537                       buf,
2538                       -1,
2539                       mbBuf.data(),
2540                       mbLen + 1,        // size in bytes, not length
2541                       NULL,
2542                       NULL
2543                    ) == 0 ||
2544                   strcmp(mbBuf, psz) != 0 )
2545             {
2546                 // we didn't obtain the same thing we started from, hence
2547                 // the conversion was lossy and we consider that it failed
2548                 return wxCONV_FAILED;
2549             }
2550         }
2551
2552         // note that it returns count of written chars for buf != NULL and size
2553         // of the needed buffer for buf == NULL so in either case the length of
2554         // the string (which never includes the terminating NUL) is one less
2555         return len - 1;
2556     }
2557
2558     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2559     {
2560         /*
2561             we have a problem here: by default, WideCharToMultiByte() may
2562             replace characters unrepresentable in the target code page with bad
2563             quality approximations such as turning "1/2" symbol (U+00BD) into
2564             "1" for the code pages which don't have it and we, obviously, want
2565             to avoid this at any price
2566
2567             the trouble is that this function does it _silently_, i.e. it won't
2568             even tell us whether it did or not... Win98/2000 and higher provide
2569             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2570             we have to resort to a round trip, i.e. check that converting back
2571             results in the same string -- this is, of course, expensive but
2572             otherwise we simply can't be sure to not garble the data.
2573          */
2574
2575         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2576         // it doesn't work with CJK encodings (which we test for rather roughly
2577         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2578         // supporting it
2579         BOOL usedDef wxDUMMY_INITIALIZE(false);
2580         BOOL *pUsedDef;
2581         int flags;
2582         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2583         {
2584             // it's our lucky day
2585             flags = WC_NO_BEST_FIT_CHARS;
2586             pUsedDef = &usedDef;
2587         }
2588         else // old system or unsupported encoding
2589         {
2590             flags = 0;
2591             pUsedDef = NULL;
2592         }
2593
2594         const size_t len = ::WideCharToMultiByte
2595                              (
2596                                 m_CodePage,     // code page
2597                                 flags,          // either none or no best fit
2598                                 pwz,            // input string
2599                                 -1,             // it is (wide) NUL-terminated
2600                                 buf,            // output buffer
2601                                 buf ? n : 0,    // and its size
2602                                 NULL,           // default "replacement" char
2603                                 pUsedDef        // [out] was it used?
2604                              );
2605
2606         if ( !len )
2607         {
2608             // function totally failed
2609             return wxCONV_FAILED;
2610         }
2611
2612         // we did something, check if we really succeeded
2613         if ( flags )
2614         {
2615             // check if the conversion failed, i.e. if any replacements
2616             // were done
2617             if ( usedDef )
2618                 return wxCONV_FAILED;
2619         }
2620         else // we must resort to double tripping...
2621         {
2622             // first we need to ensure that we really have the MB data: this is
2623             // not the case if we're called with NULL buffer, in which case we
2624             // need to do the conversion yet again
2625             wxCharBuffer bufDef;
2626             if ( !buf )
2627             {
2628                 bufDef = wxCharBuffer(len);
2629                 buf = bufDef.data();
2630                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2631                                             buf, len, NULL, NULL) )
2632                     return wxCONV_FAILED;
2633             }
2634
2635             if ( !n )
2636                 n = wcslen(pwz);
2637             wxWCharBuffer wcBuf(n);
2638             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2639                     wcscmp(wcBuf, pwz) != 0 )
2640             {
2641                 // we didn't obtain the same thing we started from, hence
2642                 // the conversion was lossy and we consider that it failed
2643                 return wxCONV_FAILED;
2644             }
2645         }
2646
2647         // see the comment above for the reason of "len - 1"
2648         return len - 1;
2649     }
2650
2651     virtual size_t GetMBNulLen() const
2652     {
2653         if ( m_minMBCharWidth == 0 )
2654         {
2655             int len = ::WideCharToMultiByte
2656                         (
2657                             m_CodePage,     // code page
2658                             0,              // no flags
2659                             L"",            // input string
2660                             1,              // translate just the NUL
2661                             NULL,           // output buffer
2662                             0,              // and its size
2663                             NULL,           // no replacement char
2664                             NULL            // [out] don't care if it was used
2665                         );
2666
2667             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2668             switch ( len )
2669             {
2670                 default:
2671                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2672                     self->m_minMBCharWidth = (size_t)-1;
2673                     break;
2674
2675                 case 0:
2676                     self->m_minMBCharWidth = (size_t)-1;
2677                     break;
2678
2679                 case 1:
2680                 case 2:
2681                 case 4:
2682                     self->m_minMBCharWidth = len;
2683                     break;
2684             }
2685         }
2686
2687         return m_minMBCharWidth;
2688     }
2689
2690     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2691
2692     bool IsOk() const { return m_CodePage != -1; }
2693
2694 private:
2695     static bool CanUseNoBestFit()
2696     {
2697         static int s_isWin98Or2k = -1;
2698
2699         if ( s_isWin98Or2k == -1 )
2700         {
2701             int verMaj, verMin;
2702             switch ( wxGetOsVersion(&verMaj, &verMin) )
2703             {
2704                 case wxOS_WINDOWS_9X:
2705                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2706                     break;
2707
2708                 case wxOS_WINDOWS_NT:
2709                     s_isWin98Or2k = verMaj >= 5;
2710                     break;
2711
2712                 default:
2713                     // unknown: be conservative by default
2714                     s_isWin98Or2k = 0;
2715                     break;
2716             }
2717
2718             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2719         }
2720
2721         return s_isWin98Or2k == 1;
2722     }
2723
2724     static bool IsAtLeastWin2kSP4()
2725     {
2726 #ifdef __WXWINCE__
2727         return false;
2728 #else
2729         static int s_isAtLeastWin2kSP4 = -1;
2730
2731         if ( s_isAtLeastWin2kSP4 == -1 )
2732         {
2733             OSVERSIONINFOEX ver;
2734
2735             memset(&ver, 0, sizeof(ver));
2736             ver.dwOSVersionInfoSize = sizeof(ver);
2737             GetVersionEx((OSVERSIONINFO*)&ver);
2738
2739             s_isAtLeastWin2kSP4 =
2740               ((ver.dwMajorVersion > 5) || // Vista+
2741                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2742                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2743                ver.wServicePackMajor >= 4)) // 2000 SP4+
2744               ? 1 : 0;
2745         }
2746
2747         return s_isAtLeastWin2kSP4 == 1;
2748 #endif
2749     }
2750
2751
2752     // the code page we're working with
2753     long m_CodePage;
2754
2755     // cached result of GetMBNulLen(), set to 0 initially meaning
2756     // "unknown"
2757     size_t m_minMBCharWidth;
2758 };
2759
2760 #endif // wxHAVE_WIN32_MB2WC
2761
2762
2763 // ============================================================================
2764 // wxEncodingConverter based conversion classes
2765 // ============================================================================
2766
2767 #if wxUSE_FONTMAP
2768
2769 class wxMBConv_wxwin : public wxMBConv
2770 {
2771 private:
2772     void Init()
2773     {
2774         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2775         // The wxMBConv_cf class does a better job.
2776         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2777                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2778                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2779     }
2780
2781 public:
2782     // temporarily just use wxEncodingConverter stuff,
2783     // so that it works while a better implementation is built
2784     wxMBConv_wxwin(const char* name)
2785     {
2786         if (name)
2787             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2788         else
2789             m_enc = wxFONTENCODING_SYSTEM;
2790
2791         Init();
2792     }
2793
2794     wxMBConv_wxwin(wxFontEncoding enc)
2795     {
2796         m_enc = enc;
2797
2798         Init();
2799     }
2800
2801     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2802     {
2803         size_t inbuf = strlen(psz);
2804         if (buf)
2805         {
2806             if (!m2w.Convert(psz, buf))
2807                 return wxCONV_FAILED;
2808         }
2809         return inbuf;
2810     }
2811
2812     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2813     {
2814         const size_t inbuf = wxWcslen(psz);
2815         if (buf)
2816         {
2817             if (!w2m.Convert(psz, buf))
2818                 return wxCONV_FAILED;
2819         }
2820
2821         return inbuf;
2822     }
2823
2824     virtual size_t GetMBNulLen() const
2825     {
2826         switch ( m_enc )
2827         {
2828             case wxFONTENCODING_UTF16BE:
2829             case wxFONTENCODING_UTF16LE:
2830                 return 2;
2831
2832             case wxFONTENCODING_UTF32BE:
2833             case wxFONTENCODING_UTF32LE:
2834                 return 4;
2835
2836             default:
2837                 return 1;
2838         }
2839     }
2840
2841     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2842
2843     bool IsOk() const { return m_ok; }
2844
2845 public:
2846     wxFontEncoding m_enc;
2847     wxEncodingConverter m2w, w2m;
2848
2849 private:
2850     // were we initialized successfully?
2851     bool m_ok;
2852
2853     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2854 };
2855
2856 // make the constructors available for unit testing
2857 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2858 {
2859     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2860     if ( !result->IsOk() )
2861     {
2862         delete result;
2863         return 0;
2864     }
2865
2866     return result;
2867 }
2868
2869 #endif // wxUSE_FONTMAP
2870
2871 // ============================================================================
2872 // wxCSConv implementation
2873 // ============================================================================
2874
2875 void wxCSConv::Init()
2876 {
2877     m_name = NULL;
2878     m_convReal =  NULL;
2879     m_deferred = true;
2880 }
2881
2882 wxCSConv::wxCSConv(const wxString& charset)
2883 {
2884     Init();
2885
2886     if ( !charset.empty() )
2887     {
2888         SetName(charset.ToAscii());
2889     }
2890
2891 #if wxUSE_FONTMAP
2892     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2893     if ( m_encoding == wxFONTENCODING_MAX )
2894     {
2895         // set to unknown/invalid value
2896         m_encoding = wxFONTENCODING_SYSTEM;
2897     }
2898     else if ( m_encoding == wxFONTENCODING_DEFAULT )
2899     {
2900         // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2901         m_encoding = wxFONTENCODING_ISO8859_1;
2902     }
2903 #else
2904     m_encoding = wxFONTENCODING_SYSTEM;
2905 #endif
2906 }
2907
2908 wxCSConv::wxCSConv(wxFontEncoding encoding)
2909 {
2910     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2911     {
2912         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2913
2914         encoding = wxFONTENCODING_SYSTEM;
2915     }
2916
2917     Init();
2918
2919     m_encoding = encoding;
2920 }
2921
// Release the charset name and the real converter, if any, via Clear().
wxCSConv::~wxCSConv()
{
    Clear();
}
2926
2927 wxCSConv::wxCSConv(const wxCSConv& conv)
2928         : wxMBConv()
2929 {
2930     Init();
2931
2932     SetName(conv.m_name);
2933     m_encoding = conv.m_encoding;
2934 }
2935
2936 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2937 {
2938     Clear();
2939
2940     SetName(conv.m_name);
2941     m_encoding = conv.m_encoding;
2942
2943     return *this;
2944 }
2945
2946 void wxCSConv::Clear()
2947 {
2948     free(m_name);
2949     delete m_convReal;
2950
2951     m_name = NULL;
2952     m_convReal = NULL;
2953 }
2954
2955 void wxCSConv::SetName(const char *charset)
2956 {
2957     if (charset)
2958     {
2959         m_name = wxStrdup(charset);
2960         m_deferred = true;
2961     }
2962 }
2963
#if wxUSE_FONTMAP

// Map from a font encoding to a charset name, used by wxCSConv::DoCreate()
// to remember which of the names of an encoding was accepted by iconv; an
// empty string is stored when none of the names worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

// the unique, file-local instance of the cache
static wxEncodingNameCache gs_nameCache;
#endif
2971
2972 wxMBConv *wxCSConv::DoCreate() const
2973 {
2974 #if wxUSE_FONTMAP
2975     wxLogTrace(TRACE_STRCONV,
2976                wxT("creating conversion for %s"),
2977                (m_name ? m_name
2978                        : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
2979 #endif // wxUSE_FONTMAP
2980
2981     // check for the special case of ASCII or ISO8859-1 charset: as we have
2982     // special knowledge of it anyhow, we don't need to create a special
2983     // conversion object
2984     if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
2985             m_encoding == wxFONTENCODING_DEFAULT )
2986     {
2987         // don't convert at all
2988         return NULL;
2989     }
2990
2991     // we trust OS to do conversion better than we can so try external
2992     // conversion methods first
2993     //
2994     // the full order is:
2995     //      1. OS conversion (iconv() under Unix or Win32 API)
2996     //      2. hard coded conversions for UTF
2997     //      3. wxEncodingConverter as fall back
2998
2999     // step (1)
3000 #ifdef HAVE_ICONV
3001 #if !wxUSE_FONTMAP
3002     if ( m_name )
3003 #endif // !wxUSE_FONTMAP
3004     {
3005 #if wxUSE_FONTMAP
3006         wxFontEncoding encoding(m_encoding);
3007 #endif
3008
3009         if ( m_name )
3010         {
3011             wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
3012             if ( conv->IsOk() )
3013                 return conv;
3014
3015             delete conv;
3016
3017 #if wxUSE_FONTMAP
3018             encoding =
3019                 wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3020 #endif // wxUSE_FONTMAP
3021         }
3022 #if wxUSE_FONTMAP
3023         {
3024             const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
3025             if ( it != gs_nameCache.end() )
3026             {
3027                 if ( it->second.empty() )
3028                     return NULL;
3029
3030                 wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
3031                 if ( conv->IsOk() )
3032                     return conv;
3033
3034                 delete conv;
3035             }
3036
3037             const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
3038             // CS : in case this does not return valid names (eg for MacRoman)
3039             // encoding got a 'failure' entry in the cache all the same,
3040             // although it just has to be created using a different method, so
3041             // only store failed iconv creation attempts (or perhaps we
3042             // shoulnd't do this at all ?)
3043             if ( names[0] != NULL )
3044             {
3045                 for ( ; *names; ++names )
3046                 {
3047                     // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
3048                     //             will need changes that will obsolete this
3049                     wxString name(*names);
3050                     wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
3051                     if ( conv->IsOk() )
3052                     {
3053                         gs_nameCache[encoding] = *names;
3054                         return conv;
3055                     }
3056
3057                     delete conv;
3058                 }
3059
3060                 gs_nameCache[encoding] = _T(""); // cache the failure
3061             }
3062         }
3063 #endif // wxUSE_FONTMAP
3064     }
3065 #endif // HAVE_ICONV
3066
3067 #ifdef wxHAVE_WIN32_MB2WC
3068     {
3069 #if wxUSE_FONTMAP
3070         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
3071                                       : new wxMBConv_win32(m_encoding);
3072         if ( conv->IsOk() )
3073             return conv;
3074
3075         delete conv;
3076 #else
3077         return NULL;
3078 #endif
3079     }
3080 #endif // wxHAVE_WIN32_MB2WC
3081
3082 #ifdef __DARWIN__
3083     {
3084         // leave UTF16 and UTF32 to the built-ins of wx
3085         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
3086             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
3087         {
3088 #if wxUSE_FONTMAP
3089             wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
3090                                           : new wxMBConv_cf(m_encoding);
3091 #else
3092             wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
3093 #endif
3094
3095             if ( conv->IsOk() )
3096                  return conv;
3097
3098             delete conv;
3099         }
3100     }
3101 #endif // __DARWIN__
3102
3103     // step (2)
3104     wxFontEncoding enc = m_encoding;
3105 #if wxUSE_FONTMAP
3106     if ( enc == wxFONTENCODING_SYSTEM && m_name )
3107     {
3108         // use "false" to suppress interactive dialogs -- we can be called from
3109         // anywhere and popping up a dialog from here is the last thing we want to
3110         // do
3111         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
3112     }
3113 #endif // wxUSE_FONTMAP
3114
3115     switch ( enc )
3116     {
3117         case wxFONTENCODING_UTF7:
3118              return new wxMBConvUTF7;
3119
3120         case wxFONTENCODING_UTF8:
3121              return new wxMBConvUTF8;
3122
3123         case wxFONTENCODING_UTF16BE:
3124              return new wxMBConvUTF16BE;
3125
3126         case wxFONTENCODING_UTF16LE:
3127              return new wxMBConvUTF16LE;
3128
3129         case wxFONTENCODING_UTF32BE:
3130              return new wxMBConvUTF32BE;
3131
3132         case wxFONTENCODING_UTF32LE:
3133              return new wxMBConvUTF32LE;
3134
3135         default:
3136              // nothing to do but put here to suppress gcc warnings
3137              break;
3138     }
3139
3140     // step (3)
3141 #if wxUSE_FONTMAP
3142     {
3143         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
3144                                       : new wxMBConv_wxwin(m_encoding);
3145         if ( conv->IsOk() )
3146             return conv;
3147
3148         delete conv;
3149     }
3150 #endif // wxUSE_FONTMAP
3151
3152     // NB: This is a hack to prevent deadlock. What could otherwise happen
3153     //     in Unicode build: wxConvLocal creation ends up being here
3154     //     because of some failure and logs the error. But wxLog will try to
3155     //     attach a timestamp, for which it will need wxConvLocal (to convert
3156     //     time to char* and then wchar_t*), but that fails, tries to log the
3157     //     error, but wxLog has an (already locked) critical section that
3158     //     guards the static buffer.
3159     static bool alreadyLoggingError = false;
3160     if (!alreadyLoggingError)
3161     {
3162         alreadyLoggingError = true;
3163         wxLogError(_("Cannot convert from the charset '%s'!"),
3164                    m_name ? m_name
3165                       :
3166 #if wxUSE_FONTMAP
3167                          (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
3168 #else // !wxUSE_FONTMAP
3169                          (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
3170 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
3171               );
3172
3173         alreadyLoggingError = false;
3174     }
3175
3176     return NULL;
3177 }
3178
3179 void wxCSConv::CreateConvIfNeeded() const
3180 {
3181     if ( m_deferred )
3182     {
3183         wxCSConv *self = (wxCSConv *)this; // const_cast
3184
3185         // if we don't have neither the name nor the encoding, use the default
3186         // encoding for this system
3187         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3188         {
3189 #if wxUSE_INTL
3190             self->m_encoding = wxLocale::GetSystemEncoding();
3191 #else
3192             // fallback to some reasonable default:
3193             self->m_encoding = wxFONTENCODING_ISO8859_1;
3194 #endif // wxUSE_INTL
3195         }
3196
3197         self->m_convReal = DoCreate();
3198         self->m_deferred = false;
3199     }
3200 }
3201
3202 bool wxCSConv::IsOk() const
3203 {
3204     CreateConvIfNeeded();
3205
3206     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3207     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3208         return true; // always ok as we do it ourselves
3209
3210     // m_convReal->IsOk() is called at its own creation, so we know it must
3211     // be ok if m_convReal is non-NULL
3212     return m_convReal != NULL;
3213 }
3214
3215 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3216                          const char *src, size_t srcLen) const
3217 {
3218     CreateConvIfNeeded();
3219
3220     if (m_convReal)
3221         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3222
3223     // latin-1 (direct)
3224     if ( srcLen == wxNO_LEN )
3225         srcLen = strlen(src) + 1; // take trailing NUL too
3226
3227     if ( dst )
3228     {
3229         if ( dstLen < srcLen )
3230             return wxCONV_FAILED;
3231
3232         for ( size_t n = 0; n < srcLen; n++ )
3233             dst[n] = (unsigned char)(src[n]);
3234     }
3235
3236     return srcLen;
3237 }
3238
3239 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3240                            const wchar_t *src, size_t srcLen) const
3241 {
3242     CreateConvIfNeeded();
3243
3244     if (m_convReal)
3245         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3246
3247     // latin-1 (direct)
3248     if ( srcLen == wxNO_LEN )
3249         srcLen = wxWcslen(src) + 1;
3250
3251     if ( dst )
3252     {
3253         if ( dstLen < srcLen )
3254             return wxCONV_FAILED;
3255
3256         for ( size_t n = 0; n < srcLen; n++ )
3257         {
3258             if ( src[n] > 0xFF )
3259                 return wxCONV_FAILED;
3260
3261             dst[n] = (char)src[n];
3262         }
3263
3264     }
3265     else // still need to check the input validity
3266     {
3267         for ( size_t n = 0; n < srcLen; n++ )
3268         {
3269             if ( src[n] > 0xFF )
3270                 return wxCONV_FAILED;
3271         }
3272     }
3273
3274     return srcLen;
3275 }
3276
3277 size_t wxCSConv::GetMBNulLen() const
3278 {
3279     CreateConvIfNeeded();
3280
3281     if ( m_convReal )
3282     {
3283         return m_convReal->GetMBNulLen();
3284     }
3285
3286     // otherwise, we are ISO-8859-1
3287     return 1;
3288 }
3289
#if wxUSE_UNICODE_UTF8
// Return whatever the real converter answers, or false if there is none as
// we are ISO-8859-1 then.
bool wxCSConv::IsUTF8() const
{
    CreateConvIfNeeded();

    return m_convReal && m_convReal->IsUTF8();
}
#endif
3304
3305
3306 #if wxUSE_UNICODE
3307
3308 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3309 {
3310     if ( !s )
3311         return wxWCharBuffer();
3312
3313     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3314     if ( !wbuf )
3315         wbuf = wxMBConvUTF8().cMB2WX(s);
3316     if ( !wbuf )
3317         wbuf = wxConvISO8859_1.cMB2WX(s);
3318
3319     return wbuf;
3320 }
3321
3322 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3323 {
3324     if ( !ws )
3325         return wxCharBuffer();
3326
3327     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3328     if ( !buf )
3329         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3330
3331     return buf;
3332 }
3333
3334 #endif // wxUSE_UNICODE
3335
3336 // ----------------------------------------------------------------------------
3337 // globals
3338 // ----------------------------------------------------------------------------
3339
// NB: The reason why we create converter objects in this convoluted way,
//     using a factory function instead of a global variable, is that they
//     may be used at static initialization time (some of them are used by
//     wxString ctors and there may be a global wxString object). In other
//     words, possibly _before_ the converter global object would be
//     initialized.
3346
// The names below are used as plain identifiers in the macro invocations
// further down, so any macro definitions of them (presumably coming from the
// public header -- TODO confirm) must be removed first.
#undef wxConvLibc
#undef wxConvUTF8
#undef wxConvUTF7
#undef wxConvLocal
#undef wxConvISO8859_1

// Define the global converter called "name".
//
// This expands to:
//  - an exported pointer name##Ptr initialized to NULL (it is not assigned
//    by this macro itself)
//  - an exported accessor wxGet_##name##Ptr() returning the address of a
//    function-local static object of the type impl_klass constructed with
//    ctor_args, as a pointer to klass
//  - a file-local pointer initialized by calling this accessor
//
// NB: the invocation of this macro must be followed by a semicolon.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()

// Simplified version of the macro above for the common case when the
// converter object is of the same class as the pointer returned for it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3367
// wxConvLibc is exposed as a plain wxMBConv but is implemented by the Win32
// API based class under Windows and by wxMBConvLibc elsewhere; in both cases
// the object is default-constructed
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif

// NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
//     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
//     provokes an error message about "not enough macro parameters"; and we
//     can't use "()" here as the name##Obj declaration would be parsed as a
//     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// converters for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers are initialized using the accessor functions and not the
// objects themselves so that the converters they point to are created before
// being used, whatever the order of initialization of the globals is
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();

#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// converter used for the file names: the UTF-8 one defined above under
// Darwin and the same one as wxConvLibc everywhere else
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3400 #endif // __DARWIN__/!__DARWIN__
3401
3402 #else // !wxUSE_WCHAR_T
3403
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: plain wxMBConv objects are defined under
// the names of the global converters in this case
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3410
3411 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T