src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        src/common/strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // For compilers that support precompilation, includes "wx.h".
  16 #include "wx/wxprec.h"
  17
  18 #ifdef __BORLANDC__
  19     #pragma hdrstop
  20 #endif  //__BORLANDC__
  21
  22 #ifndef WX_PRECOMP
  23     #include "wx/intl.h"
  24     #include "wx/log.h"
  25     #include "wx/utils.h"
  26     #include "wx/hashmap.h"
  27 #endif
  28
  29 #include "wx/strconv.h"
  30
  31 #if wxUSE_WCHAR_T
  32
  33 #ifndef __WXWINCE__
  34 #include <errno.h>
  35 #endif
  36
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <stdlib.h>
  40
  41 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  42     #include "wx/msw/private.h"
  43     #include "wx/msw/missing.h"
  44     #define wxHAVE_WIN32_MB2WC
  45 #endif
  46
  47 #ifdef HAVE_ICONV
  48     #include <iconv.h>
  49     #include "wx/thread.h"
  50 #endif
  51
  52 #include "wx/encconv.h"
  53 #include "wx/fontmap.h"
  54
  55 #ifdef __DARWIN__
  56 #include "wx/osx/core/private/strconv_cf.h"
  57 #endif //def __DARWIN__
  58
  59
  60 #define TRACE_STRCONV _T("strconv")
  61
// WC_UTF16 is defined only if sizeof(wchar_t) == 2, otherwise wchar_t is
// supposed to be 4 bytes wide
  64 #if SIZEOF_WCHAR_T == 2
  65     #define WC_UTF16
  66 #endif
  67
  68
  69 // ============================================================================
  70 // implementation
  71 // ============================================================================
  72
  73 // helper function of cMB2WC(): check if n bytes at this location are all NUL
  74 static bool NotAllNULs(const char *p, size_t n)
  75 {
  76     while ( n && *p++ == '\0' )
  77         n--;
  78
  79     return n != 0;
  80 }
  81
  82 // ----------------------------------------------------------------------------
  83 // UTF-16 en/decoding to/from UCS-4 with surrogates handling
  84 // ----------------------------------------------------------------------------
  85
  86 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
  87 {
  88     if (input <= 0xffff)
  89     {
  90         if (output)
  91             *output = (wxUint16) input;
  92
  93         return 1;
  94     }
  95     else if (input >= 0x110000)
  96     {
  97         return wxCONV_FAILED;
  98     }
  99     else
 100     {
 101         if (output)
 102         {
 103             *output++ = (wxUint16) ((input >> 10) + 0xd7c0);
 104             *output = (wxUint16) ((input & 0x3ff) + 0xdc00);
 105         }
 106
 107         return 2;
 108     }
 109 }
 110
 111 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 112 {
 113     if ((*input < 0xd800) || (*input > 0xdfff))
 114     {
 115         output = *input;
 116         return 1;
 117     }
 118     else if ((input[1] < 0xdc00) || (input[1] > 0xdfff))
 119     {
 120         output = *input;
 121         return wxCONV_FAILED;
 122     }
 123     else
 124     {
 125         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 126         return 2;
 127     }
 128 }
 129
// type of the elements of the buffer passed to wxDecodeSurrogate(): this is
// wchar_t itself if WC_UTF16 is defined and wxUint16 otherwise
#ifdef WC_UTF16
    typedef wchar_t wxDecodeSurrogate_t;
#else // !WC_UTF16
    typedef wxUint16 wxDecodeSurrogate_t;
#endif // WC_UTF16/!WC_UTF16
 135
 136 // returns the next UTF-32 character from the wchar_t buffer and advances the
 137 // pointer to the character after this one
 138 //
 139 // if an invalid character is found, *pSrc is set to NULL, the caller must
 140 // check for this
 141 static wxUint32 wxDecodeSurrogate(const wxDecodeSurrogate_t **pSrc)
 142 {
 143     wxUint32 out;
 144     const size_t
 145         n = decode_utf16(reinterpret_cast<const wxUint16 *>(*pSrc), out);
 146     if ( n == wxCONV_FAILED )
 147         *pSrc = NULL;
 148     else
 149         *pSrc += n;
 150
 151     return out;
 152 }
 153
 154 // ----------------------------------------------------------------------------
 155 // wxMBConv
 156 // ----------------------------------------------------------------------------
 157
// Default implementation of the multibyte to wide character conversion in
// terms of the old MB2WC() function.
//
//  dst     the output buffer or NULL if only the needed size is to be computed
//  dstLen  the size of dst in wide characters, only used if dst is not NULL
//  src     the multibyte input
//  srcLen  the length of src in bytes or wxNO_LEN if src is NUL-terminated
//
// Returns the number of wide characters [which would be] written to dst or
// wxCONV_FAILED if GetMBNulLen() or MB2WC() failed or if dst is too small.
size_t
wxMBConv::ToWChar(wchar_t *dst, size_t dstLen,
                  const char *src, size_t srcLen) const
{
    // although new conversion classes are supposed to implement this function
    // directly, the existing ones only implement the old MB2WC() and so, to
    // avoid to have to rewrite all conversion classes at once, we provide a
    // default (but not efficient) implementation of this one in terms of the
    // old function by copying the input to ensure that it's NUL-terminated and
    // then using MB2WC() to convert it
    //
    // moreover, some conversion classes simply can't implement ToWChar()
    // directly, the primary example is wxConvLibc: mbstowcs() only handles
    // NUL-terminated strings

    // the number of chars [which would be] written to dst [if it were not NULL]
    size_t dstWritten = 0;

    // the number of NULs terminating this string
    size_t nulLen = 0;  // not really needed, but just to avoid warnings

    // if we were not given the input size we just have to assume that the
    // string is properly terminated as we have no way of knowing how long it
    // is anyhow, but if we do have the size check whether there are enough
    // NULs at the end
    wxCharBuffer bufTmp;
    const char *srcEnd;
    if ( srcLen != wxNO_LEN )
    {
        // we need to know how to find the end of this string
        nulLen = GetMBNulLen();
        if ( nulLen == wxCONV_FAILED )
            return wxCONV_FAILED;

        // if there are enough NULs we can avoid the copy
        if ( srcLen < nulLen || NotAllNULs(src + srcLen - nulLen, nulLen) )
        {
            // make a copy in order to properly NUL-terminate the string
            bufTmp = wxCharBuffer(srcLen + nulLen - 1 /* 1 will be added */);
            char * const p = bufTmp.data();
            memcpy(p, src, srcLen);
            for ( char *s = p + srcLen; s < p + srcLen + nulLen; s++ )
                *s = '\0';

            src = bufTmp;
        }

        srcEnd = src + srcLen;
    }
    else // quit after the first loop iteration
    {
        srcEnd = NULL;
    }

    // the idea of this code is straightforward: it converts a NUL-terminated
    // chunk of the string during each iteration and updates the output buffer
    // with the result
    //
    // all the complications come from the fact that this function, for
    // historical reasons, must behave in 2 subtly different ways when it's
    // called for the entire NUL-terminated string and when it's called with a
    // fixed number of characters: in the former case (srcEnd == NULL) the
    // trailing NUL is counted in the returned length; but in the latter one
    // (srcEnd != NULL) the NUL terminating the last chunk is not counted --
    // while all the NULs followed by more input inside the string still are
    //
    // so for the (simple) former case we just always count the trailing NUL,
    // but for the latter we need to wait until we see if there is going to be
    // another loop iteration and only count it then
    for ( ;; )
    {
        // try to convert the current chunk
        size_t lenChunk = MB2WC(NULL, src, 0);
        if ( lenChunk == wxCONV_FAILED )
            return wxCONV_FAILED;

        dstWritten += lenChunk;
        if ( !srcEnd )
            dstWritten++;

        if ( !lenChunk )
        {
            // nothing left in the input string, conversion succeeded
            break;
        }

        if ( dst )
        {
            if ( dstWritten > dstLen )
                return wxCONV_FAILED;

            // +1 is for trailing NUL
            if ( MB2WC(dst, src, lenChunk + 1) == wxCONV_FAILED )
                return wxCONV_FAILED;

            dst += lenChunk;
            if ( !srcEnd )
                dst++;
        }

        if ( !srcEnd )
        {
            // we convert just one chunk in this case as this is the entire
            // string anyhow
            break;
        }

        // advance the input pointer past the end of this chunk
        while ( NotAllNULs(src, nulLen) )
        {
            // notice that we must skip over multiple bytes here as we suppose
            // that if NUL takes 2 or 4 bytes, then all the other characters do
            // too and so if advanced by a single byte we might erroneously
            // detect sequences of NUL bytes in the middle of the input
            src += nulLen;
        }

        src += nulLen; // skipping over its terminator as well

        // note that ">=" (and not just "==") is needed here as the terminator
        // we skipped just above could be inside or just after the buffer
        // delimited by srcEnd
        if ( src >= srcEnd )
            break;

        // if we got here then this wasn't the last chunk in this string and
        // hence we must count an extra char for L'\0' even when converting a
        // fixed number of characters (srcEnd is necessarily non-NULL here as
        // we would have already left the loop above otherwise)
        if ( srcEnd )
        {
            dstWritten++;
            if ( dst )
                dst++;
        }
    }

    return dstWritten;
}
 296
 297 size_t
 298 wxMBConv::FromWChar(char *dst, size_t dstLen,
 299                     const wchar_t *src, size_t srcLen) const
 300 {
 301     // the number of chars [which would be] written to dst [if it were not NULL]
 302     size_t dstWritten = 0;
 303
 304     // if we don't know its length we have no choice but to assume that it is
 305     // NUL-terminated (notice that it can still be NUL-terminated even if
 306     // explicit length is given but it doesn't change our return value)
 307     const bool isNulTerminated = srcLen == wxNO_LEN;
 308
 309     // make a copy of the input string unless it is already properly
 310     // NUL-terminated
 311     wxWCharBuffer bufTmp;
 312     if ( isNulTerminated )
 313     {
 314         srcLen = wxWcslen(src) + 1;
 315     }
 316     else if ( srcLen != 0 && src[srcLen - 1] != L'\0' )
 317     {
 318         // make a copy in order to properly NUL-terminate the string
 319         bufTmp = wxWCharBuffer(srcLen);
 320         memcpy(bufTmp.data(), src, srcLen * sizeof(wchar_t));
 321         src = bufTmp;
 322     }
 323
 324     const size_t lenNul = GetMBNulLen();
 325     for ( const wchar_t * const srcEnd = src + srcLen;
 326           src < srcEnd;
 327           src += wxWcslen(src) + 1 /* skip L'\0' too */ )
 328     {
 329         // try to convert the current chunk
 330         size_t lenChunk = WC2MB(NULL, src, 0);
 331
 332         if ( lenChunk == wxCONV_FAILED )
 333             return wxCONV_FAILED;
 334
 335         dstWritten += lenChunk;
 336         if ( src+lenChunk < srcEnd || isNulTerminated )
 337             dstWritten += lenNul;
 338
 339         if ( dst )
 340         {
 341             if ( dstWritten > dstLen )
 342                 return wxCONV_FAILED;
 343
 344             if ( WC2MB(dst, src, lenChunk + lenNul) == wxCONV_FAILED )
 345                 return wxCONV_FAILED;
 346
 347             dst += lenChunk;
 348             if ( src+lenChunk < srcEnd || isNulTerminated )
 349                 dst += lenNul;
 350         }
 351     }
 352
 353     return dstWritten;
 354 }
 355
 356 size_t wxMBConv::MB2WC(wchar_t *outBuff, const char *inBuff, size_t outLen) const
 357 {
 358     size_t rc = ToWChar(outBuff, outLen, inBuff);
 359     if ( rc != wxCONV_FAILED )
 360     {
 361         // ToWChar() returns the buffer length, i.e. including the trailing
 362         // NUL, while this method doesn't take it into account
 363         rc--;
 364     }
 365
 366     return rc;
 367 }
 368
 369 size_t wxMBConv::WC2MB(char *outBuff, const wchar_t *inBuff, size_t outLen) const
 370 {
 371     size_t rc = FromWChar(outBuff, outLen, inBuff);
 372     if ( rc != wxCONV_FAILED )
 373     {
 374         rc -= GetMBNulLen();
 375     }
 376
 377     return rc;
 378 }
 379
// the destructor is defined out of line here even though its body is empty
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 384
 385 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 386 {
 387     if ( psz )
 388     {
 389         // calculate the length of the buffer needed first
 390         const size_t nLen = ToWChar(NULL, 0, psz);
 391         if ( nLen != wxCONV_FAILED )
 392         {
 393             // now do the actual conversion
 394             wxWCharBuffer buf(nLen - 1 /* +1 added implicitly */);
 395
 396             // +1 for the trailing NULL
 397             if ( ToWChar(buf.data(), nLen, psz) != wxCONV_FAILED )
 398                 return buf;
 399         }
 400     }
 401
 402     return wxWCharBuffer();
 403 }
 404
 405 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 406 {
 407     if ( pwz )
 408     {
 409         const size_t nLen = FromWChar(NULL, 0, pwz);
 410         if ( nLen != wxCONV_FAILED )
 411         {
 412             wxCharBuffer buf(nLen - 1);
 413             if ( FromWChar(buf.data(), nLen, pwz) != wxCONV_FAILED )
 414                 return buf;
 415         }
 416     }
 417
 418     return wxCharBuffer();
 419 }
 420
 421 const wxWCharBuffer
 422 wxMBConv::cMB2WC(const char *inBuff, size_t inLen, size_t *outLen) const
 423 {
 424     const size_t dstLen = ToWChar(NULL, 0, inBuff, inLen);
 425     if ( dstLen != wxCONV_FAILED )
 426     {
 427         // notice that we allocate space for dstLen+1 wide characters here
 428         // because we want the buffer to always be NUL-terminated, even if the
 429         // input isn't (as otherwise the caller has no way to know its length)
 430         wxWCharBuffer wbuf(dstLen);
 431         wbuf.data()[dstLen] = L'\0';
 432         if ( ToWChar(wbuf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 433         {
 434             if ( outLen )
 435             {
 436                 *outLen = dstLen;
 437
 438                 // we also need to handle NUL-terminated input strings
 439                 // specially: for them the output is the length of the string
 440                 // excluding the trailing NUL, however if we're asked to
 441                 // convert a specific number of characters we return the length
 442                 // of the resulting output even if it's NUL-terminated
 443                 if ( inLen == wxNO_LEN )
 444                     (*outLen)--;
 445             }
 446
 447             return wbuf;
 448         }
 449     }
 450
 451     if ( outLen )
 452         *outLen = 0;
 453
 454     return wxWCharBuffer();
 455 }
 456
 457 const wxCharBuffer
 458 wxMBConv::cWC2MB(const wchar_t *inBuff, size_t inLen, size_t *outLen) const
 459 {
 460     size_t dstLen = FromWChar(NULL, 0, inBuff, inLen);
 461     if ( dstLen != wxCONV_FAILED )
 462     {
 463         const size_t nulLen = GetMBNulLen();
 464
 465         // as above, ensure that the buffer is always NUL-terminated, even if
 466         // the input is not
 467         wxCharBuffer buf(dstLen + nulLen - 1);
 468         memset(buf.data() + dstLen, 0, nulLen);
 469         if ( FromWChar(buf.data(), dstLen, inBuff, inLen) != wxCONV_FAILED )
 470         {
 471             if ( outLen )
 472             {
 473                 *outLen = dstLen;
 474
 475                 if ( inLen == wxNO_LEN )
 476                 {
 477                     // in this case both input and output are NUL-terminated
 478                     // and we're not supposed to count NUL
 479                     *outLen -= nulLen;
 480                 }
 481             }
 482
 483             return buf;
 484         }
 485     }
 486
 487     if ( outLen )
 488         *outLen = 0;
 489
 490     return wxCharBuffer();
 491 }
 492
 493 // ----------------------------------------------------------------------------
 494 // wxMBConvLibc
 495 // ----------------------------------------------------------------------------
 496
// simply forwards to wxMB2WC() with the same arguments and returns its result
// unchanged
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 501
// simply forwards to wxWC2MB() with the same arguments and returns its result
// unchanged
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 506
 507 // ----------------------------------------------------------------------------
 508 // wxConvBrokenFileNames
 509 // ----------------------------------------------------------------------------
 510
 511 #ifdef __UNIX__
 512
 513 wxConvBrokenFileNames::wxConvBrokenFileNames(const wxString& charset)
 514 {
 515     if ( wxStricmp(charset, _T("UTF-8")) == 0 ||
 516          wxStricmp(charset, _T("UTF8")) == 0  )
 517         m_conv = new wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_PUA);
 518     else
 519         m_conv = new wxCSConv(charset);
 520 }
 521
 522 #endif // __UNIX__
 523
 524 // ----------------------------------------------------------------------------
 525 // UTF-7
 526 // ----------------------------------------------------------------------------
 527
 528 // Implementation (C) 2004 Fredrik Roubert
 529 //
 530 // Changes to work in streaming mode (C) 2008 Vadim Zeitlin
 531
//
// BASE64 decoding table
//
// it is indexed by the byte value and contains the 6 bit value (0..0x3f)
// corresponding to this byte for 'A'..'Z', 'a'..'z', '0'..'9', '+' and '/' and
// 0xff for all the other bytes, i.e. those which can't appear in the encoded
// part of the string; as it has 256 elements, it must only be indexed with
// values in 0..255 range
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 570
 571 size_t wxMBConvUTF7::ToWChar(wchar_t *dst, size_t dstLen,
 572                              const char *src, size_t srcLen) const
 573 {
 574     DecoderState stateOrig,
 575                 *statePtr;
 576     if ( srcLen == wxNO_LEN )
 577     {
 578         // convert the entire string, up to and including the trailing NUL
 579         srcLen = strlen(src) + 1;
 580
 581         // when working on the entire strings we don't update nor use the shift
 582         // state from the previous call
 583         statePtr = &stateOrig;
 584     }
 585     else // when working with partial strings we do use the shift state
 586     {
 587         statePtr = const_cast<DecoderState *>(&m_stateDecoder);
 588
 589         // also save the old state to be able to rollback to it on error
 590         stateOrig = m_stateDecoder;
 591     }
 592
 593     // but to simplify the code below we use this variable in both cases
 594     DecoderState& state = *statePtr;
 595
 596
 597     // number of characters [which would have been] written to dst [if it were
 598     // not NULL]
 599     size_t len = 0;
 600
 601     const char * const srcEnd = src + srcLen;
 602
 603     while ( (src < srcEnd) && (!dst || (len < dstLen)) )
 604     {
 605         const unsigned char cc = *src++;
 606
 607         if ( state.IsShifted() )
 608         {
 609             const unsigned char dc = utf7unb64[cc];
 610             if ( dc == 0xff )
 611             {
 612                 // end of encoded part, check that nothing was left: there can
 613                 // be up to 4 bits of 0 padding but nothing else (we also need
 614                 // to check isLSB as we count bits modulo 8 while a valid UTF-7
 615                 // encoded sequence must contain an integral number of UTF-16
 616                 // characters)
 617                 if ( state.isLSB || state.bit > 4 ||
 618                         (state.accum & ((1 << state.bit) - 1)) )
 619                 {
 620                     if ( !len )
 621                         state = stateOrig;
 622
 623                     return wxCONV_FAILED;
 624                 }
 625
 626                 state.ToDirect();
 627
 628                 // re-parse this character normally below unless it's '-' which
 629                 // is consumed by the decoder
 630                 if ( cc == '-' )
 631                     continue;
 632             }
 633             else // valid encoded character
 634             {
 635                 // mini base64 decoder: each character is 6 bits
 636                 state.bit += 6;
 637                 state.accum <<= 6;
 638                 state.accum += dc;
 639
 640                 if ( state.bit >= 8 )
 641                 {
 642                     // got the full byte, consume it
 643                     state.bit -= 8;
 644                     unsigned char b = (state.accum >> state.bit) & 0x00ff;
 645
 646                     if ( state.isLSB )
 647                     {
 648                         // we've got the full word, output it
 649                         if ( dst )
 650                             *dst++ = (state.msb << 8) | b;
 651                         len++;
 652                         state.isLSB = false;
 653                     }
 654                     else // MSB
 655                     {
 656                         // just store it while we wait for LSB
 657                         state.msb = b;
 658                         state.isLSB = true;
 659                     }
 660                 }
 661             }
 662         }
 663
 664         if ( state.IsDirect() )
 665         {
 666             // start of an encoded segment?
 667             if ( cc == '+' )
 668             {
 669                 if ( *src == '-' )
 670                 {
 671                     // just the encoded plus sign, don't switch to shifted mode
 672                     if ( dst )
 673                         *dst++ = '+';
 674                     len++;
 675                     src++;
 676                 }
 677                 else if ( utf7unb64[(unsigned)*src] == 0xff )
 678                 {
 679                     // empty encoded chunks are not allowed
 680                     if ( !len )
 681                         state = stateOrig;
 682
 683                     return wxCONV_FAILED;
 684                 }
 685                 else // base-64 encoded chunk follows
 686                 {
 687                     state.ToShifted();
 688                 }
 689             }
 690             else // not '+'
 691             {
 692                 // only printable 7 bit ASCII characters (with the exception of
 693                 // NUL, TAB, CR and LF) can be used directly
 694                 if ( cc >= 0x7f || (cc < ' ' &&
 695                       !(cc == '\0' || cc == '\t' || cc == '\r' || cc == '\n')) )
 696                     return wxCONV_FAILED;
 697
 698                 if ( dst )
 699                     *dst++ = cc;
 700                 len++;
 701             }
 702         }
 703     }
 704
 705     if ( !len )
 706     {
 707         // as we didn't read any characters we should be called with the same
 708         // data (followed by some more new data) again later so don't save our
 709         // state
 710         state = stateOrig;
 711
 712         return wxCONV_FAILED;
 713     }
 714
 715     return len;
 716 }
 717
//
// BASE64 encoding table
//
// maps each of the 64 possible 6 bit values to the character representing it
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 732
//
// UTF-7 encoding table
//
// it is indexed by the 7 bit character code and gives the class of this
// character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// only the characters of class 0 are considered to be direct by
// wxIsUTF7Direct() below
//
static const unsigned char utf7encode[128] =
{
    0, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 752
 753 static inline bool wxIsUTF7Direct(wchar_t wc)
 754 {
 755     return wc < 0x80 && utf7encode[wc] < 1;
 756 }
 757
 758 size_t wxMBConvUTF7::FromWChar(char *dst, size_t dstLen,
 759                                const wchar_t *src, size_t srcLen) const
 760 {
 761     EncoderState stateOrig,
 762                 *statePtr;
 763     if ( srcLen == wxNO_LEN )
 764     {
 765         // we don't apply the stored state when operating on entire strings at
 766         // once
 767         statePtr = &stateOrig;
 768
 769         srcLen = wxWcslen(src) + 1;
 770     }
 771     else // do use the mode we left the output in previously
 772     {
 773         stateOrig = m_stateEncoder;
 774         statePtr = const_cast<EncoderState *>(&m_stateEncoder);
 775     }
 776
 777     EncoderState& state = *statePtr;
 778
 779
 780     size_t len = 0;
 781
 782     const wchar_t * const srcEnd = src + srcLen;
 783     while ( src < srcEnd && (!dst || len < dstLen) )
 784     {
 785         wchar_t cc = *src++;
 786         if ( wxIsUTF7Direct(cc) )
 787         {
 788             if ( state.IsShifted() )
 789             {
 790                 // pad with zeros the last encoded block if necessary
 791                 if ( state.bit )
 792                 {
 793                     if ( dst )
 794                         *dst++ = utf7enb64[((state.accum % 16) << (6 - state.bit)) % 64];
 795                     len++;
 796                 }
 797
 798                 state.ToDirect();
 799
 800                 if ( dst )
 801                     *dst++ = '-';
 802                 len++;
 803             }
 804
 805             if ( dst )
 806                 *dst++ = (char)cc;
 807             len++;
 808         }
 809         else if ( cc == '+' && state.IsDirect() )
 810         {
 811             if ( dst )
 812             {
 813                 *dst++ = '+';
 814                 *dst++ = '-';
 815             }
 816
 817             len += 2;
 818         }
 819 #ifndef WC_UTF16
 820         else if (((wxUint32)cc) > 0xffff)
 821         {
 822             // no surrogate pair generation (yet?)
 823             return wxCONV_FAILED;
 824         }
 825 #endif
 826         else
 827         {
 828             if ( state.IsDirect() )
 829             {
 830                 state.ToShifted();
 831
 832                 if ( dst )
 833                     *dst++ = '+';
 834                 len++;
 835             }
 836
 837             // BASE64 encode string
 838             for ( ;; )
 839             {
 840                 for ( unsigned lsb = 0; lsb < 2; lsb++ )
 841                 {
 842                     state.accum <<= 8;
 843                     state.accum += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 844
 845                     for (state.bit += 8; state.bit >= 6; )
 846                     {
 847                         state.bit -= 6;
 848                         if ( dst )
 849                             *dst++ = utf7enb64[(state.accum >> state.bit) % 64];
 850                         len++;
 851                     }
 852                 }
 853
 854                 if ( src == srcEnd || wxIsUTF7Direct(cc = *src) )
 855                     break;
 856
 857                 src++;
 858             }
 859         }
 860     }
 861
 862     // we need to restore the original encoder state if we were called just to
 863     // calculate the amount of space needed as we will presumably be called
 864     // again to really convert the data now
 865     if ( !dst )
 866         state = stateOrig;
 867
 868     return len;
 869 }
 870
 871 // ----------------------------------------------------------------------------
 872 // UTF-8
 873 // ----------------------------------------------------------------------------
 874
// the first 6 values are the greatest numbers with 7, 11, 16, 21, 26 and 31
// bits respectively, followed by the greatest 32 bit value
//
// NOTE(review): presumably utf8_max[n] is the greatest value which can be
// encoded using n+1 bytes in UTF-8, to be confirmed by the code using this
// table which is not in this part of the file
static const wxUint32 utf8_max[]=
    { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 877
// boundaries of the private use area we use to (temporarily) remap the
// characters which are invalid in a UTF-8 encoded string: the range starts at
// wxUnicodePUA and has 256 values, wxUnicodePUAEnd being the first one after
// its end
const wxUint32 wxUnicodePUA = 0x100000;
const wxUint32 wxUnicodePUAEnd = wxUnicodePUA + 256;
 882
// this table gives the length of the UTF-8 encoding from its first character:
// it is indexed by the value of the first byte of the sequence and contains
// the total number of bytes in it (1 to 4) or 0 if this byte can't start a
// valid sequence
const unsigned char tableUtf8Lengths[256] = {
    // single-byte sequences (ASCII):
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 00..0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 10..1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 20..2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 30..3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 40..4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 50..5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 60..6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 70..7F

    // these are invalid:
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 80..8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 90..9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A0..AF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // B0..BF
    0, 0,                                            // C0,C1

    // two-byte sequences:
          2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // C2..CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,  // D0..DF

    // three-byte sequences:
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,  // E0..EF

    // four-byte sequences:
    4, 4, 4, 4, 4,                                   // F0..F4

    // these are invalid again (5- or 6-byte
    // sequences and sequences for code points
    // above U+10FFFF, as restricted by RFC 3629):
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0   // F5..FF
};
 917
 918 size_t
 919 wxMBConvStrictUTF8::ToWChar(wchar_t *dst, size_t dstLen,
 920                             const char *src, size_t srcLen) const
 921 {
 922     wchar_t *out = dstLen ? dst : NULL;
 923     size_t written = 0;
 924
 925     if ( srcLen == wxNO_LEN )
 926         srcLen = strlen(src) + 1;
 927
 928     for ( const char *p = src; ; p++ )
 929     {
 930         if ( !(srcLen == wxNO_LEN ? *p : srcLen) )
 931         {
 932             // all done successfully, just add the trailing NULL if we are not
 933             // using explicit length
 934             if ( srcLen == wxNO_LEN )
 935             {
 936                 if ( out )
 937                 {
 938                     if ( !dstLen )
 939                         break;
 940
 941                     *out = L'\0';
 942                 }
 943
 944                 written++;
 945             }
 946
 947             return written;
 948         }
 949
 950         if ( out && !dstLen-- )
 951             break;
 952
 953         wxUint32 code;
 954         unsigned char c = *p;
 955
 956         if ( c < 0x80 )
 957         {
 958             if ( srcLen == 0 ) // the test works for wxNO_LEN too
 959                 break;
 960
 961             if ( srcLen != wxNO_LEN )
 962                 srcLen--;
 963
 964             code = c;
 965         }
 966         else
 967         {
 968             unsigned len = tableUtf8Lengths[c];
 969             if ( !len )
 970                 break;
 971
 972             if ( srcLen < len ) // the test works for wxNO_LEN too
 973                 break;
 974
 975             if ( srcLen != wxNO_LEN )
 976                 srcLen -= len;
 977
 978             //   Char. number range   |        UTF-8 octet sequence
 979             //      (hexadecimal)     |              (binary)
 980             //  ----------------------+----------------------------------------
 981             //  0000 0000 - 0000 007F | 0xxxxxxx
 982             //  0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx
 983             //  0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 984             //  0001 0000 - 0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 985             //
 986             //  Code point value is stored in bits marked with 'x',
 987             //  lowest-order bit of the value on the right side in the diagram
 988             //  above.                                         (from RFC 3629)
 989
 990             // mask to extract lead byte's value ('x' bits above), by sequence
 991             // length:
 992             static const unsigned char leadValueMask[] = { 0x7F, 0x1F, 0x0F, 0x07 };
 993
 994             // mask and value of lead byte's most significant bits, by length:
 995             static const unsigned char leadMarkerMask[] = { 0x80, 0xE0, 0xF0, 0xF8 };
 996             static const unsigned char leadMarkerVal[] = { 0x00, 0xC0, 0xE0, 0xF0 };
 997
 998             len--; // it's more convenient to work with 0-based length here
 999
1000             // extract the lead byte's value bits:
1001             if ( (c & leadMarkerMask[len]) != leadMarkerVal[len] )
1002                 break;
1003
1004             code = c & leadValueMask[len];
1005
1006             // all remaining bytes, if any, are handled in the same way
1007             // regardless of sequence's length:
1008             for ( ; len; --len )
1009             {
1010                 c = *++p;
1011                 if ( (c & 0xC0) != 0x80 )
1012                     return wxCONV_FAILED;
1013
1014                 code <<= 6;
1015                 code |= c & 0x3F;
1016             }
1017         }
1018
1019 #ifdef WC_UTF16
1020         // cast is ok because wchar_t == wxUint16 if WC_UTF16
1021         if ( encode_utf16(code, (wxUint16 *)out) == 2 )
1022         {
1023             if ( out )
1024                 out++;
1025             written++;
1026         }
1027 #else // !WC_UTF16
1028         if ( out )
1029             *out = code;
1030 #endif // WC_UTF16/!WC_UTF16
1031
1032         if ( out )
1033             out++;
1034
1035         written++;
1036     }
1037
1038     return wxCONV_FAILED;
1039 }
1040
1041 size_t
1042 wxMBConvStrictUTF8::FromWChar(char *dst, size_t dstLen,
1043                               const wchar_t *src, size_t srcLen) const
1044 {
1045     char *out = dstLen ? dst : NULL;
1046     size_t written = 0;
1047
1048     for ( const wchar_t *wp = src; ; wp++ )
1049     {
1050         if ( !(srcLen == wxNO_LEN ? *wp : srcLen) )
1051         {
1052             // all done successfully, just add the trailing NULL if we are not
1053             // using explicit length
1054             if ( srcLen == wxNO_LEN )
1055             {
1056                 if ( out )
1057                 {
1058                     if ( !dstLen )
1059                         break;
1060
1061                     *out = '\0';
1062                 }
1063
1064                 written++;
1065             }
1066
1067             return written;
1068         }
1069
1070         if ( srcLen != wxNO_LEN )
1071             srcLen--;
1072
1073         wxUint32 code;
1074 #ifdef WC_UTF16
1075         // cast is ok for WC_UTF16
1076         if ( decode_utf16((const wxUint16 *)wp, code) == 2 )
1077         {
1078             // skip the next char too as we decoded a surrogate
1079             wp++;
1080         }
1081 #else // wchar_t is UTF-32
1082         code = *wp & 0x7fffffff;
1083 #endif
1084
1085         unsigned len;
1086         if ( code <= 0x7F )
1087         {
1088             len = 1;
1089             if ( out )
1090             {
1091                 if ( dstLen < len )
1092                     break;
1093
1094                 out[0] = (char)code;
1095             }
1096         }
1097         else if ( code <= 0x07FF )
1098         {
1099             len = 2;
1100             if ( out )
1101             {
1102                 if ( dstLen < len )
1103                     break;
1104
1105                 // NB: this line takes 6 least significant bits, encodes them as
1106                 // 10xxxxxx and discards them so that the next byte can be encoded:
1107                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1108                 out[0] = 0xC0 | code;
1109             }
1110         }
1111         else if ( code < 0xFFFF )
1112         {
1113             len = 3;
1114             if ( out )
1115             {
1116                 if ( dstLen < len )
1117                     break;
1118
1119                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1120                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1121                 out[0] = 0xE0 | code;
1122             }
1123         }
1124         else if ( code <= 0x10FFFF )
1125         {
1126             len = 4;
1127             if ( out )
1128             {
1129                 if ( dstLen < len )
1130                     break;
1131
1132                 out[3] = 0x80 | (code & 0x3F);  code >>= 6;
1133                 out[2] = 0x80 | (code & 0x3F);  code >>= 6;
1134                 out[1] = 0x80 | (code & 0x3F);  code >>= 6;
1135                 out[0] = 0xF0 | code;
1136             }
1137         }
1138         else
1139         {
1140             wxFAIL_MSG( _T("trying to encode undefined Unicode character") );
1141             break;
1142         }
1143
1144         if ( out )
1145         {
1146             out += len;
1147             dstLen -= len;
1148         }
1149
1150         written += len;
1151     }
1152
1153     // we only get here if an error occurs during decoding
1154     return wxCONV_FAILED;
1155 }
1156
1157 size_t wxMBConvUTF8::ToWChar(wchar_t *buf, size_t n,
1158                              const char *psz, size_t srcLen) const
1159 {
1160     if ( m_options == MAP_INVALID_UTF8_NOT )
1161         return wxMBConvStrictUTF8::ToWChar(buf, n, psz, srcLen);
1162
1163     size_t len = 0;
1164
1165     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1166     {
1167         const char *opsz = psz;
1168         bool invalid = false;
1169         unsigned char cc = *psz++, fc = cc;
1170         unsigned cnt;
1171         for (cnt = 0; fc & 0x80; cnt++)
1172             fc <<= 1;
1173
1174         if (!cnt)
1175         {
1176             // plain ASCII char
1177             if (buf)
1178                 *buf++ = cc;
1179             len++;
1180
1181             // escape the escape character for octal escapes
1182             if ((m_options & MAP_INVALID_UTF8_TO_OCTAL)
1183                     && cc == '\\' && (!buf || len < n))
1184             {
1185                 if (buf)
1186                     *buf++ = cc;
1187                 len++;
1188             }
1189         }
1190         else
1191         {
1192             cnt--;
1193             if (!cnt)
1194             {
1195                 // invalid UTF-8 sequence
1196                 invalid = true;
1197             }
1198             else
1199             {
1200                 unsigned ocnt = cnt - 1;
1201                 wxUint32 res = cc & (0x3f >> cnt);
1202                 while (cnt--)
1203                 {
1204                     cc = *psz;
1205                     if ((cc & 0xC0) != 0x80)
1206                     {
1207                         // invalid UTF-8 sequence
1208                         invalid = true;
1209                         break;
1210                     }
1211
1212                     psz++;
1213                     res = (res << 6) | (cc & 0x3f);
1214                 }
1215
1216                 if (invalid || res <= utf8_max[ocnt])
1217                 {
1218                     // illegal UTF-8 encoding
1219                     invalid = true;
1220                 }
1221                 else if ((m_options & MAP_INVALID_UTF8_TO_PUA) &&
1222                         res >= wxUnicodePUA && res < wxUnicodePUAEnd)
1223                 {
1224                     // if one of our PUA characters turns up externally
1225                     // it must also be treated as an illegal sequence
1226                     // (a bit like you have to escape an escape character)
1227                     invalid = true;
1228                 }
1229                 else
1230                 {
1231 #ifdef WC_UTF16
1232                     // cast is ok because wchar_t == wxUint16 if WC_UTF16
1233                     size_t pa = encode_utf16(res, (wxUint16 *)buf);
1234                     if (pa == wxCONV_FAILED)
1235                     {
1236                         invalid = true;
1237                     }
1238                     else
1239                     {
1240                         if (buf)
1241                             buf += pa;
1242                         len += pa;
1243                     }
1244 #else // !WC_UTF16
1245                     if (buf)
1246                         *buf++ = (wchar_t)res;
1247                     len++;
1248 #endif // WC_UTF16/!WC_UTF16
1249                 }
1250             }
1251
1252             if (invalid)
1253             {
1254                 if (m_options & MAP_INVALID_UTF8_TO_PUA)
1255                 {
1256                     while (opsz < psz && (!buf || len < n))
1257                     {
1258 #ifdef WC_UTF16
1259                         // cast is ok because wchar_t == wxUuint16 if WC_UTF16
1260                         size_t pa = encode_utf16((unsigned char)*opsz + wxUnicodePUA, (wxUint16 *)buf);
1261                         wxASSERT(pa != wxCONV_FAILED);
1262                         if (buf)
1263                             buf += pa;
1264                         opsz++;
1265                         len += pa;
1266 #else
1267                         if (buf)
1268                             *buf++ = (wchar_t)(wxUnicodePUA + (unsigned char)*opsz);
1269                         opsz++;
1270                         len++;
1271 #endif
1272                     }
1273                 }
1274                 else if (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1275                 {
1276                     while (opsz < psz && (!buf || len < n))
1277                     {
1278                         if ( buf && len + 3 < n )
1279                         {
1280                             unsigned char on = *opsz;
1281                             *buf++ = L'\\';
1282                             *buf++ = (wchar_t)( L'0' + on / 0100 );
1283                             *buf++ = (wchar_t)( L'0' + (on % 0100) / 010 );
1284                             *buf++ = (wchar_t)( L'0' + on % 010 );
1285                         }
1286
1287                         opsz++;
1288                         len += 4;
1289                     }
1290                 }
1291                 else // MAP_INVALID_UTF8_NOT
1292                 {
1293                     return wxCONV_FAILED;
1294                 }
1295             }
1296         }
1297     }
1298
1299     if (srcLen == wxNO_LEN && buf && (len < n))
1300         *buf = 0;
1301
1302     return len + 1;
1303 }
1304
// check whether the given wide character is one of the octal digits '0'..'7'
static inline bool isoctal(wchar_t wch)
{
    if ( wch < L'0' )
        return false;

    return wch <= L'7';
}
1309
1310 size_t wxMBConvUTF8::FromWChar(char *buf, size_t n,
1311                                const wchar_t *psz, size_t srcLen) const
1312 {
1313     if ( m_options == MAP_INVALID_UTF8_NOT )
1314         return wxMBConvStrictUTF8::FromWChar(buf, n, psz, srcLen);
1315
1316     size_t len = 0;
1317
1318     while ((srcLen == wxNO_LEN ? *psz : srcLen--) && ((!buf) || (len < n)))
1319     {
1320         wxUint32 cc;
1321
1322 #ifdef WC_UTF16
1323         // cast is ok for WC_UTF16
1324         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
1325         psz += (pa == wxCONV_FAILED) ? 1 : pa;
1326 #else
1327         cc = (*psz++) & 0x7fffffff;
1328 #endif
1329
1330         if ( (m_options & MAP_INVALID_UTF8_TO_PUA)
1331                 && cc >= wxUnicodePUA && cc < wxUnicodePUAEnd )
1332         {
1333             if (buf)
1334                 *buf++ = (char)(cc - wxUnicodePUA);
1335             len++;
1336         }
1337         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL)
1338                     && cc == L'\\' && psz[0] == L'\\' )
1339         {
1340             if (buf)
1341                 *buf++ = (char)cc;
1342             psz++;
1343             len++;
1344         }
1345         else if ( (m_options & MAP_INVALID_UTF8_TO_OCTAL) &&
1346                     cc == L'\\' &&
1347                         isoctal(psz[0]) && isoctal(psz[1]) && isoctal(psz[2]) )
1348         {
1349             if (buf)
1350             {
1351                 *buf++ = (char) ((psz[0] - L'0') * 0100 +
1352                                  (psz[1] - L'0') * 010 +
1353                                  (psz[2] - L'0'));
1354             }
1355
1356             psz += 3;
1357             len++;
1358         }
1359         else
1360         {
1361             unsigned cnt;
1362             for (cnt = 0; cc > utf8_max[cnt]; cnt++)
1363             {
1364             }
1365
1366             if (!cnt)
1367             {
1368                 // plain ASCII char
1369                 if (buf)
1370                     *buf++ = (char) cc;
1371                 len++;
1372             }
1373             else
1374             {
1375                 len += cnt + 1;
1376                 if (buf)
1377                 {
1378                     *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
1379                     while (cnt--)
1380                         *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
1381                 }
1382             }
1383         }
1384     }
1385
1386     if (srcLen == wxNO_LEN && buf && (len < n))
1387         *buf = 0;
1388
1389     return len + 1;
1390 }
1391
1392 // ============================================================================
1393 // UTF-16
1394 // ============================================================================
1395
// "straight" is the converter for UTF-16 in this machine's byte order and
// "swap" is the one for the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else // little endian
    #define wxMBConvUTF16straight wxMBConvUTF16LE
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
1403
1404 /* static */
1405 size_t wxMBConvUTF16Base::GetLength(const char *src, size_t srcLen)
1406 {
1407     if ( srcLen == wxNO_LEN )
1408     {
1409         // count the number of bytes in input, including the trailing NULs
1410         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1411         for ( srcLen = 1; *inBuff++; srcLen++ )
1412             ;
1413
1414         srcLen *= BYTES_PER_CHAR;
1415     }
1416     else // we already have the length
1417     {
1418         // we can only convert an entire number of UTF-16 characters
1419         if ( srcLen % BYTES_PER_CHAR )
1420             return wxCONV_FAILED;
1421     }
1422
1423     return srcLen;
1424 }
1425
1426 // case when in-memory representation is UTF-16 too
1427 #ifdef WC_UTF16
1428
1429 // ----------------------------------------------------------------------------
1430 // conversions without endianness change
1431 // ----------------------------------------------------------------------------
1432
1433 size_t
1434 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1435                                const char *src, size_t srcLen) const
1436 {
1437     // set up the scene for using memcpy() (which is presumably more efficient
1438     // than copying the bytes one by one)
1439     srcLen = GetLength(src, srcLen);
1440     if ( srcLen == wxNO_LEN )
1441         return wxCONV_FAILED;
1442
1443     const size_t inLen = srcLen / BYTES_PER_CHAR;
1444     if ( dst )
1445     {
1446         if ( dstLen < inLen )
1447             return wxCONV_FAILED;
1448
1449         memcpy(dst, src, srcLen);
1450     }
1451
1452     return inLen;
1453 }
1454
1455 size_t
1456 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1457                                  const wchar_t *src, size_t srcLen) const
1458 {
1459     if ( srcLen == wxNO_LEN )
1460         srcLen = wxWcslen(src) + 1;
1461
1462     srcLen *= BYTES_PER_CHAR;
1463
1464     if ( dst )
1465     {
1466         if ( dstLen < srcLen )
1467             return wxCONV_FAILED;
1468
1469         memcpy(dst, src, srcLen);
1470     }
1471
1472     return srcLen;
1473 }
1474
1475 // ----------------------------------------------------------------------------
1476 // endian-reversing conversions
1477 // ----------------------------------------------------------------------------
1478
1479 size_t
1480 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1481                            const char *src, size_t srcLen) const
1482 {
1483     srcLen = GetLength(src, srcLen);
1484     if ( srcLen == wxNO_LEN )
1485         return wxCONV_FAILED;
1486
1487     srcLen /= BYTES_PER_CHAR;
1488
1489     if ( dst )
1490     {
1491         if ( dstLen < srcLen )
1492             return wxCONV_FAILED;
1493
1494         const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1495         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1496         {
1497             *dst++ = wxUINT16_SWAP_ALWAYS(*inBuff);
1498         }
1499     }
1500
1501     return srcLen;
1502 }
1503
1504 size_t
1505 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1506                              const wchar_t *src, size_t srcLen) const
1507 {
1508     if ( srcLen == wxNO_LEN )
1509         srcLen = wxWcslen(src) + 1;
1510
1511     srcLen *= BYTES_PER_CHAR;
1512
1513     if ( dst )
1514     {
1515         if ( dstLen < srcLen )
1516             return wxCONV_FAILED;
1517
1518         wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1519         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1520         {
1521             *outBuff++ = wxUINT16_SWAP_ALWAYS(*src);
1522         }
1523     }
1524
1525     return srcLen;
1526 }
1527
1528 #else // !WC_UTF16: wchar_t is UTF-32
1529
1530 // ----------------------------------------------------------------------------
1531 // conversions without endianness change
1532 // ----------------------------------------------------------------------------
1533
1534 size_t
1535 wxMBConvUTF16straight::ToWChar(wchar_t *dst, size_t dstLen,
1536                                const char *src, size_t srcLen) const
1537 {
1538     srcLen = GetLength(src, srcLen);
1539     if ( srcLen == wxNO_LEN )
1540         return wxCONV_FAILED;
1541
1542     const size_t inLen = srcLen / BYTES_PER_CHAR;
1543     if ( !dst )
1544     {
1545         // optimization: return maximal space which could be needed for this
1546         // string even if the real size could be smaller if the buffer contains
1547         // any surrogates
1548         return inLen;
1549     }
1550
1551     size_t outLen = 0;
1552     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1553     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1554     {
1555         const wxUint32 ch = wxDecodeSurrogate(&inBuff);
1556         if ( !inBuff )
1557             return wxCONV_FAILED;
1558
1559         if ( ++outLen > dstLen )
1560             return wxCONV_FAILED;
1561
1562         *dst++ = ch;
1563     }
1564
1565
1566     return outLen;
1567 }
1568
1569 size_t
1570 wxMBConvUTF16straight::FromWChar(char *dst, size_t dstLen,
1571                                  const wchar_t *src, size_t srcLen) const
1572 {
1573     if ( srcLen == wxNO_LEN )
1574         srcLen = wxWcslen(src) + 1;
1575
1576     size_t outLen = 0;
1577     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1578     for ( size_t n = 0; n < srcLen; n++ )
1579     {
1580         wxUint16 cc[2];
1581         const size_t numChars = encode_utf16(*src++, cc);
1582         if ( numChars == wxCONV_FAILED )
1583             return wxCONV_FAILED;
1584
1585         outLen += numChars * BYTES_PER_CHAR;
1586         if ( outBuff )
1587         {
1588             if ( outLen > dstLen )
1589                 return wxCONV_FAILED;
1590
1591             *outBuff++ = cc[0];
1592             if ( numChars == 2 )
1593             {
1594                 // second character of a surrogate
1595                 *outBuff++ = cc[1];
1596             }
1597         }
1598     }
1599
1600     return outLen;
1601 }
1602
1603 // ----------------------------------------------------------------------------
1604 // endian-reversing conversions
1605 // ----------------------------------------------------------------------------
1606
1607 size_t
1608 wxMBConvUTF16swap::ToWChar(wchar_t *dst, size_t dstLen,
1609                            const char *src, size_t srcLen) const
1610 {
1611     srcLen = GetLength(src, srcLen);
1612     if ( srcLen == wxNO_LEN )
1613         return wxCONV_FAILED;
1614
1615     const size_t inLen = srcLen / BYTES_PER_CHAR;
1616     if ( !dst )
1617     {
1618         // optimization: return maximal space which could be needed for this
1619         // string even if the real size could be smaller if the buffer contains
1620         // any surrogates
1621         return inLen;
1622     }
1623
1624     size_t outLen = 0;
1625     const wxUint16 *inBuff = reinterpret_cast<const wxUint16 *>(src);
1626     for ( const wxUint16 * const inEnd = inBuff + inLen; inBuff < inEnd; )
1627     {
1628         wxUint32 ch;
1629         wxUint16 tmp[2];
1630
1631         tmp[0] = wxUINT16_SWAP_ALWAYS(*inBuff);
1632         inBuff++;
1633         tmp[1] = wxUINT16_SWAP_ALWAYS(*inBuff);
1634
1635         const size_t numChars = decode_utf16(tmp, ch);
1636         if ( numChars == wxCONV_FAILED )
1637             return wxCONV_FAILED;
1638
1639         if ( numChars == 2 )
1640             inBuff++;
1641
1642         if ( ++outLen > dstLen )
1643             return wxCONV_FAILED;
1644
1645         *dst++ = ch;
1646     }
1647
1648
1649     return outLen;
1650 }
1651
1652 size_t
1653 wxMBConvUTF16swap::FromWChar(char *dst, size_t dstLen,
1654                              const wchar_t *src, size_t srcLen) const
1655 {
1656     if ( srcLen == wxNO_LEN )
1657         srcLen = wxWcslen(src) + 1;
1658
1659     size_t outLen = 0;
1660     wxUint16 *outBuff = reinterpret_cast<wxUint16 *>(dst);
1661     for ( const wchar_t *srcEnd = src + srcLen; src < srcEnd; src++ )
1662     {
1663         wxUint16 cc[2];
1664         const size_t numChars = encode_utf16(*src, cc);
1665         if ( numChars == wxCONV_FAILED )
1666             return wxCONV_FAILED;
1667
1668         outLen += numChars * BYTES_PER_CHAR;
1669         if ( outBuff )
1670         {
1671             if ( outLen > dstLen )
1672                 return wxCONV_FAILED;
1673
1674             *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[0]);
1675             if ( numChars == 2 )
1676             {
1677                 // second character of a surrogate
1678                 *outBuff++ = wxUINT16_SWAP_ALWAYS(cc[1]);
1679             }
1680         }
1681     }
1682
1683     return outLen;
1684 }
1685
1686 #endif // WC_UTF16/!WC_UTF16
1687
1688
1689 // ============================================================================
1690 // UTF-32
1691 // ============================================================================
1692
// "straight" is the converter for UTF-32 in this machine's byte order and
// "swap" is the one for the opposite byte order
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF32straight  wxMBConvUTF32BE
    #define wxMBConvUTF32swap      wxMBConvUTF32LE
#else // little endian
    #define wxMBConvUTF32straight  wxMBConvUTF32LE
    #define wxMBConvUTF32swap      wxMBConvUTF32BE
#endif // WORDS_BIGENDIAN/!WORDS_BIGENDIAN
1700
1701
// the global UTF-32 converter objects, one for each byte order
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
1704
1705 /* static */
1706 size_t wxMBConvUTF32Base::GetLength(const char *src, size_t srcLen)
1707 {
1708     if ( srcLen == wxNO_LEN )
1709     {
1710         // count the number of bytes in input, including the trailing NULs
1711         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1712         for ( srcLen = 1; *inBuff++; srcLen++ )
1713             ;
1714
1715         srcLen *= BYTES_PER_CHAR;
1716     }
1717     else // we already have the length
1718     {
1719         // we can only convert an entire number of UTF-32 characters
1720         if ( srcLen % BYTES_PER_CHAR )
1721             return wxCONV_FAILED;
1722     }
1723
1724     return srcLen;
1725 }
1726
1727 // case when in-memory representation is UTF-16
1728 #ifdef WC_UTF16
1729
1730 // ----------------------------------------------------------------------------
1731 // conversions without endianness change
1732 // ----------------------------------------------------------------------------
1733
1734 size_t
1735 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1736                                const char *src, size_t srcLen) const
1737 {
1738     srcLen = GetLength(src, srcLen);
1739     if ( srcLen == wxNO_LEN )
1740         return wxCONV_FAILED;
1741
1742     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1743     const size_t inLen = srcLen / BYTES_PER_CHAR;
1744     size_t outLen = 0;
1745     for ( size_t n = 0; n < inLen; n++ )
1746     {
1747         wxUint16 cc[2];
1748         const size_t numChars = encode_utf16(*inBuff++, cc);
1749         if ( numChars == wxCONV_FAILED )
1750             return wxCONV_FAILED;
1751
1752         outLen += numChars;
1753         if ( dst )
1754         {
1755             if ( outLen > dstLen )
1756                 return wxCONV_FAILED;
1757
1758             *dst++ = cc[0];
1759             if ( numChars == 2 )
1760             {
1761                 // second character of a surrogate
1762                 *dst++ = cc[1];
1763             }
1764         }
1765     }
1766
1767     return outLen;
1768 }
1769
1770 size_t
1771 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1772                                  const wchar_t *src, size_t srcLen) const
1773 {
1774     if ( srcLen == wxNO_LEN )
1775         srcLen = wxWcslen(src) + 1;
1776
1777     if ( !dst )
1778     {
1779         // optimization: return maximal space which could be needed for this
1780         // string instead of the exact amount which could be less if there are
1781         // any surrogates in the input
1782         //
1783         // we consider that surrogates are rare enough to make it worthwhile to
1784         // avoid running the loop below at the cost of slightly extra memory
1785         // consumption
1786         return srcLen * BYTES_PER_CHAR;
1787     }
1788
1789     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1790     size_t outLen = 0;
1791     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1792     {
1793         const wxUint32 ch = wxDecodeSurrogate(&src);
1794         if ( !src )
1795             return wxCONV_FAILED;
1796
1797         outLen += BYTES_PER_CHAR;
1798
1799         if ( outLen > dstLen )
1800             return wxCONV_FAILED;
1801
1802         *outBuff++ = ch;
1803     }
1804
1805     return outLen;
1806 }
1807
1808 // ----------------------------------------------------------------------------
1809 // endian-reversing conversions
1810 // ----------------------------------------------------------------------------
1811
1812 size_t
1813 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1814                            const char *src, size_t srcLen) const
1815 {
1816     srcLen = GetLength(src, srcLen);
1817     if ( srcLen == wxNO_LEN )
1818         return wxCONV_FAILED;
1819
1820     const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1821     const size_t inLen = srcLen / BYTES_PER_CHAR;
1822     size_t outLen = 0;
1823     for ( size_t n = 0; n < inLen; n++, inBuff++ )
1824     {
1825         wxUint16 cc[2];
1826         const size_t numChars = encode_utf16(wxUINT32_SWAP_ALWAYS(*inBuff), cc);
1827         if ( numChars == wxCONV_FAILED )
1828             return wxCONV_FAILED;
1829
1830         outLen += numChars;
1831         if ( dst )
1832         {
1833             if ( outLen > dstLen )
1834                 return wxCONV_FAILED;
1835
1836             *dst++ = cc[0];
1837             if ( numChars == 2 )
1838             {
1839                 // second character of a surrogate
1840                 *dst++ = cc[1];
1841             }
1842         }
1843     }
1844
1845     return outLen;
1846 }
1847
1848 size_t
1849 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1850                              const wchar_t *src, size_t srcLen) const
1851 {
1852     if ( srcLen == wxNO_LEN )
1853         srcLen = wxWcslen(src) + 1;
1854
1855     if ( !dst )
1856     {
1857         // optimization: return maximal space which could be needed for this
1858         // string instead of the exact amount which could be less if there are
1859         // any surrogates in the input
1860         //
1861         // we consider that surrogates are rare enough to make it worthwhile to
1862         // avoid running the loop below at the cost of slightly extra memory
1863         // consumption
1864         return srcLen*BYTES_PER_CHAR;
1865     }
1866
1867     wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1868     size_t outLen = 0;
1869     for ( const wchar_t * const srcEnd = src + srcLen; src < srcEnd; )
1870     {
1871         const wxUint32 ch = wxDecodeSurrogate(&src);
1872         if ( !src )
1873             return wxCONV_FAILED;
1874
1875         outLen += BYTES_PER_CHAR;
1876
1877         if ( outLen > dstLen )
1878             return wxCONV_FAILED;
1879
1880         *outBuff++ = wxUINT32_SWAP_ALWAYS(ch);
1881     }
1882
1883     return outLen;
1884 }
1885
1886 #else // !WC_UTF16: wchar_t is UTF-32
1887
1888 // ----------------------------------------------------------------------------
1889 // conversions without endianness change
1890 // ----------------------------------------------------------------------------
1891
1892 size_t
1893 wxMBConvUTF32straight::ToWChar(wchar_t *dst, size_t dstLen,
1894                                const char *src, size_t srcLen) const
1895 {
1896     // use memcpy() as it should be much faster than hand-written loop
1897     srcLen = GetLength(src, srcLen);
1898     if ( srcLen == wxNO_LEN )
1899         return wxCONV_FAILED;
1900
1901     const size_t inLen = srcLen/BYTES_PER_CHAR;
1902     if ( dst )
1903     {
1904         if ( dstLen < inLen )
1905             return wxCONV_FAILED;
1906
1907         memcpy(dst, src, srcLen);
1908     }
1909
1910     return inLen;
1911 }
1912
1913 size_t
1914 wxMBConvUTF32straight::FromWChar(char *dst, size_t dstLen,
1915                                  const wchar_t *src, size_t srcLen) const
1916 {
1917     if ( srcLen == wxNO_LEN )
1918         srcLen = wxWcslen(src) + 1;
1919
1920     srcLen *= BYTES_PER_CHAR;
1921
1922     if ( dst )
1923     {
1924         if ( dstLen < srcLen )
1925             return wxCONV_FAILED;
1926
1927         memcpy(dst, src, srcLen);
1928     }
1929
1930     return srcLen;
1931 }
1932
1933 // ----------------------------------------------------------------------------
1934 // endian-reversing conversions
1935 // ----------------------------------------------------------------------------
1936
1937 size_t
1938 wxMBConvUTF32swap::ToWChar(wchar_t *dst, size_t dstLen,
1939                            const char *src, size_t srcLen) const
1940 {
1941     srcLen = GetLength(src, srcLen);
1942     if ( srcLen == wxNO_LEN )
1943         return wxCONV_FAILED;
1944
1945     srcLen /= BYTES_PER_CHAR;
1946
1947     if ( dst )
1948     {
1949         if ( dstLen < srcLen )
1950             return wxCONV_FAILED;
1951
1952         const wxUint32 *inBuff = reinterpret_cast<const wxUint32 *>(src);
1953         for ( size_t n = 0; n < srcLen; n++, inBuff++ )
1954         {
1955             *dst++ = wxUINT32_SWAP_ALWAYS(*inBuff);
1956         }
1957     }
1958
1959     return srcLen;
1960 }
1961
1962 size_t
1963 wxMBConvUTF32swap::FromWChar(char *dst, size_t dstLen,
1964                              const wchar_t *src, size_t srcLen) const
1965 {
1966     if ( srcLen == wxNO_LEN )
1967         srcLen = wxWcslen(src) + 1;
1968
1969     srcLen *= BYTES_PER_CHAR;
1970
1971     if ( dst )
1972     {
1973         if ( dstLen < srcLen )
1974             return wxCONV_FAILED;
1975
1976         wxUint32 *outBuff = reinterpret_cast<wxUint32 *>(dst);
1977         for ( size_t n = 0; n < srcLen; n += BYTES_PER_CHAR, src++ )
1978         {
1979             *outBuff++ = wxUINT32_SWAP_ALWAYS(*src);
1980         }
1981     }
1982
1983     return srcLen;
1984 }
1985
1986 #endif // WC_UTF16/!WC_UTF16
1987
1988
1989 // ============================================================================
1990 // The classes doing conversion using the iconv_xxx() functions
1991 // ============================================================================
1992
1993 #ifdef HAVE_ICONV
1994
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft) takes the value returned by iconv() and the
// number of input bytes it left unconverted and evaluates to true if the call
// must be considered as failed. Both arguments are parenthesized in the
// expansion so that arbitrary expressions can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast a pointer to the input buffer pointer to the type used for the second
// argument of iconv() (ICONV_CONST is defined outside of this file section)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))

// the value used to mark a conversion descriptor which couldn't be opened
#define ICONV_T_INVALID ((iconv_t)-1)
2013
2014 #if SIZEOF_WCHAR_T == 4
2015     #define WC_BSWAP    wxUINT32_SWAP_ALWAYS
2016     #define WC_ENC      wxFONTENCODING_UTF32
2017 #elif SIZEOF_WCHAR_T == 2
2018     #define WC_BSWAP    wxUINT16_SWAP_ALWAYS
2019     #define WC_ENC      wxFONTENCODING_UTF16
2020 #else // sizeof(wchar_t) != 2 nor 4
2021     // does this ever happen?
2022     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
2023 #endif
2024
2025 // ----------------------------------------------------------------------------
2026 // wxMBConv_iconv: encapsulates an iconv character set
2027 // ----------------------------------------------------------------------------
2028
2029 class wxMBConv_iconv : public wxMBConv
2030 {
2031 public:
2032     wxMBConv_iconv(const char *name);
2033     virtual ~wxMBConv_iconv();
2034
2035     // implement base class virtual methods
2036     virtual size_t ToWChar(wchar_t *dst, size_t dstLen,
2037                            const char *src, size_t srcLen = wxNO_LEN) const;
2038     virtual size_t FromWChar(char *dst, size_t dstLen,
2039                              const wchar_t *src, size_t srcLen = wxNO_LEN) const;
2040     virtual size_t GetMBNulLen() const;
2041
2042 #if wxUSE_UNICODE_UTF8
2043     virtual bool IsUTF8() const;
2044 #endif
2045
2046     virtual wxMBConv *Clone() const
2047     {
2048         wxMBConv_iconv *p = new wxMBConv_iconv(m_name.ToAscii());
2049         p->m_minMBCharWidth = m_minMBCharWidth;
2050         return p;
2051     }
2052
2053     bool IsOk() const
2054         { return (m2w != ICONV_T_INVALID) && (w2m != ICONV_T_INVALID); }
2055
2056 protected:
2057     // the iconv handlers used to translate from multibyte
2058     // to wide char and in the other direction
2059     iconv_t m2w,
2060             w2m;
2061
2062 #if wxUSE_THREADS
2063     // guards access to m2w and w2m objects
2064     wxMutex m_iconvMutex;
2065 #endif
2066
2067 private:
2068     // the name (for iconv_open()) of a wide char charset -- if none is
2069     // available on this machine, it will remain NULL
2070     static wxString ms_wcCharsetName;
2071
2072     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
2073     // different endian-ness than the native one
2074     static bool ms_wcNeedsSwap;
2075
2076
2077     // name of the encoding handled by this conversion
2078     wxString m_name;
2079
2080     // cached result of GetMBNulLen(); set to 0 meaning "unknown"
2081     // initially
2082     size_t m_minMBCharWidth;
2083 };
2084
2085 // make the constructor available for unit testing
2086 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_iconv( const char* name )
2087 {
2088     wxMBConv_iconv* result = new wxMBConv_iconv( name );
2089     if ( !result->IsOk() )
2090     {
2091         delete result;
2092         return 0;
2093     }
2094
2095     return result;
2096 }
2097
2098 wxString wxMBConv_iconv::ms_wcCharsetName;
2099 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
2100
2101 wxMBConv_iconv::wxMBConv_iconv(const char *name)
2102               : m_name(name)
2103 {
2104     m_minMBCharWidth = 0;
2105
2106     // check for charset that represents wchar_t:
2107     if ( ms_wcCharsetName.empty() )
2108     {
2109         wxLogTrace(TRACE_STRCONV, _T("Looking for wide char codeset:"));
2110
2111 #if wxUSE_FONTMAP
2112         const wxChar **names = wxFontMapperBase::GetAllEncodingNames(WC_ENC);
2113 #else // !wxUSE_FONTMAP
2114         static const wxChar *names_static[] =
2115         {
2116 #if SIZEOF_WCHAR_T == 4
2117             _T("UCS-4"),
2118 #elif SIZEOF_WCHAR_T = 2
2119             _T("UCS-2"),
2120 #endif
2121             NULL
2122         };
2123         const wxChar **names = names_static;
2124 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2125
2126         for ( ; *names && ms_wcCharsetName.empty(); ++names )
2127         {
2128             const wxString nameCS(*names);
2129
2130             // first try charset with explicit bytesex info (e.g. "UCS-4LE"):
2131             wxString nameXE(nameCS);
2132
2133 #ifdef WORDS_BIGENDIAN
2134                 nameXE += _T("BE");
2135 #else // little endian
2136                 nameXE += _T("LE");
2137 #endif
2138
2139             wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2140                        nameXE.c_str());
2141
2142             m2w = iconv_open(nameXE.ToAscii(), name);
2143             if ( m2w == ICONV_T_INVALID )
2144             {
2145                 // try charset w/o bytesex info (e.g. "UCS4")
2146                 wxLogTrace(TRACE_STRCONV, _T("  trying charset \"%s\""),
2147                            nameCS.c_str());
2148                 m2w = iconv_open(nameCS.ToAscii(), name);
2149
2150                 // and check for bytesex ourselves:
2151                 if ( m2w != ICONV_T_INVALID )
2152                 {
2153                     char    buf[2], *bufPtr;
2154                     wchar_t wbuf[2];
2155                     size_t  insz, outsz;
2156                     size_t  res;
2157
2158                     buf[0] = 'A';
2159                     buf[1] = 0;
2160                     wbuf[0] = 0;
2161                     insz = 2;
2162                     outsz = SIZEOF_WCHAR_T * 2;
2163                     char* wbufPtr = (char*)wbuf;
2164                     bufPtr = buf;
2165
2166                     res = iconv(
2167                         m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
2168                         &wbufPtr, &outsz);
2169
2170                     if (ICONV_FAILED(res, insz))
2171                     {
2172                         wxLogLastError(wxT("iconv"));
2173                         wxLogError(_("Conversion to charset '%s' doesn't work."),
2174                                    nameCS.c_str());
2175                     }
2176                     else // ok, can convert to this encoding, remember it
2177                     {
2178                         ms_wcCharsetName = nameCS;
2179                         ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
2180                     }
2181                 }
2182             }
2183             else // use charset not requiring byte swapping
2184             {
2185                 ms_wcCharsetName = nameXE;
2186             }
2187         }
2188
2189         wxLogTrace(TRACE_STRCONV,
2190                    wxT("iconv wchar_t charset is \"%s\"%s"),
2191                    ms_wcCharsetName.empty() ? wxString("<none>")
2192                                             : ms_wcCharsetName,
2193                    ms_wcNeedsSwap ? _T(" (needs swap)")
2194                                   : _T(""));
2195     }
2196     else // we already have ms_wcCharsetName
2197     {
2198         m2w = iconv_open(ms_wcCharsetName.ToAscii(), name);
2199     }
2200
2201     if ( ms_wcCharsetName.empty() )
2202     {
2203         w2m = ICONV_T_INVALID;
2204     }
2205     else
2206     {
2207         w2m = iconv_open(name, ms_wcCharsetName.ToAscii());
2208         if ( w2m == ICONV_T_INVALID )
2209         {
2210             wxLogTrace(TRACE_STRCONV,
2211                        wxT("\"%s\" -> \"%s\" works but not the converse!?"),
2212                        ms_wcCharsetName.c_str(), name);
2213         }
2214     }
2215 }
2216
2217 wxMBConv_iconv::~wxMBConv_iconv()
2218 {
2219     if ( m2w != ICONV_T_INVALID )
2220         iconv_close(m2w);
2221     if ( w2m != ICONV_T_INVALID )
2222         iconv_close(w2m);
2223 }
2224
2225 size_t
2226 wxMBConv_iconv::ToWChar(wchar_t *dst, size_t dstLen,
2227                         const char *src, size_t srcLen) const
2228 {
2229     if ( srcLen == wxNO_LEN )
2230     {
2231         // find the string length: notice that must be done differently for
2232         // NUL-terminated strings and UTF-16/32 which are terminated with 2/4
2233         // consecutive NULs
2234         const size_t nulLen = GetMBNulLen();
2235         switch ( nulLen )
2236         {
2237             default:
2238                 return wxCONV_FAILED;
2239
2240             case 1:
2241                 srcLen = strlen(src); // arguably more optimized than our version
2242                 break;
2243
2244             case 2:
2245             case 4:
2246                 // for UTF-16/32 not only we need to have 2/4 consecutive NULs
2247                 // but they also have to start at character boundary and not
2248                 // span two adjacent characters
2249                 const char *p;
2250                 for ( p = src; NotAllNULs(p, nulLen); p += nulLen )
2251                     ;
2252                 srcLen = p - src;
2253                 break;
2254         }
2255
2256         // when we're determining the length of the string ourselves we count
2257         // the terminating NUL(s) as part of it and always NUL-terminate the
2258         // output
2259         srcLen += nulLen;
2260     }
2261
2262     // we express length in the number of (wide) characters but iconv always
2263     // counts buffer sizes it in bytes
2264     dstLen *= SIZEOF_WCHAR_T;
2265
2266 #if wxUSE_THREADS
2267     // NB: iconv() is MT-safe, but each thread must use its own iconv_t handle.
2268     //     Unfortunately there are a couple of global wxCSConv objects such as
2269     //     wxConvLocal that are used all over wx code, so we have to make sure
2270     //     the handle is used by at most one thread at the time. Otherwise
2271     //     only a few wx classes would be safe to use from non-main threads
2272     //     as MB<->WC conversion would fail "randomly".
2273     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2274 #endif // wxUSE_THREADS
2275
2276     size_t res, cres;
2277     const char *pszPtr = src;
2278
2279     if ( dst )
2280     {
2281         char* bufPtr = (char*)dst;
2282
2283         // have destination buffer, convert there
2284         size_t dstLenOrig = dstLen;
2285         cres = iconv(m2w,
2286                      ICONV_CHAR_CAST(&pszPtr), &srcLen,
2287                      &bufPtr, &dstLen);
2288
2289         // convert the number of bytes converted as returned by iconv to the
2290         // number of (wide) characters converted that we need
2291         res = (dstLenOrig - dstLen) / SIZEOF_WCHAR_T;
2292
2293         if (ms_wcNeedsSwap)
2294         {
2295             // convert to native endianness
2296             for ( unsigned i = 0; i < res; i++ )
2297                 dst[i] = WC_BSWAP(dst[i]);
2298         }
2299     }
2300     else // no destination buffer
2301     {
2302         // convert using temp buffer to calculate the size of the buffer needed
2303         wchar_t tbuf[256];
2304         res = 0;
2305
2306         do
2307         {
2308             char* bufPtr = (char*)tbuf;
2309             dstLen = 8 * SIZEOF_WCHAR_T;
2310
2311             cres = iconv(m2w,
2312                          ICONV_CHAR_CAST(&pszPtr), &srcLen,
2313                          &bufPtr, &dstLen );
2314
2315             res += 8 - (dstLen / SIZEOF_WCHAR_T);
2316         }
2317         while ((cres == (size_t)-1) && (errno == E2BIG));
2318     }
2319
2320     if (ICONV_FAILED(cres, srcLen))
2321     {
2322         //VS: it is ok if iconv fails, hence trace only
2323         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2324         return wxCONV_FAILED;
2325     }
2326
2327     return res;
2328 }
2329
2330 size_t wxMBConv_iconv::FromWChar(char *dst, size_t dstLen,
2331                                  const wchar_t *src, size_t srcLen) const
2332 {
2333 #if wxUSE_THREADS
2334     // NB: explained in MB2WC
2335     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
2336 #endif
2337
2338     if ( srcLen == wxNO_LEN )
2339         srcLen = wxWcslen(src) + 1;
2340
2341     size_t inbuflen = srcLen * SIZEOF_WCHAR_T;
2342     size_t outbuflen = dstLen;
2343     size_t res, cres;
2344
2345     wchar_t *tmpbuf = 0;
2346
2347     if (ms_wcNeedsSwap)
2348     {
2349         // need to copy to temp buffer to switch endianness
2350         // (doing WC_BSWAP twice on the original buffer won't work, as it
2351         //  could be in read-only memory, or be accessed in some other thread)
2352         tmpbuf = (wchar_t *)malloc(inbuflen);
2353         for ( size_t i = 0; i < srcLen; i++ )
2354             tmpbuf[i] = WC_BSWAP(src[i]);
2355
2356         src = tmpbuf;
2357     }
2358
2359     char* inbuf = (char*)src;
2360     if ( dst )
2361     {
2362         // have destination buffer, convert there
2363         cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2364
2365         res = dstLen - outbuflen;
2366     }
2367     else // no destination buffer
2368     {
2369         // convert using temp buffer to calculate the size of the buffer needed
2370         char tbuf[256];
2371         res = 0;
2372         do
2373         {
2374             dst = tbuf;
2375             outbuflen = WXSIZEOF(tbuf);
2376
2377             cres = iconv(w2m, ICONV_CHAR_CAST(&inbuf), &inbuflen, &dst, &outbuflen);
2378
2379             res += WXSIZEOF(tbuf) - outbuflen;
2380         }
2381         while ((cres == (size_t)-1) && (errno == E2BIG));
2382     }
2383
2384     if (ms_wcNeedsSwap)
2385     {
2386         free(tmpbuf);
2387     }
2388
2389     if (ICONV_FAILED(cres, inbuflen))
2390     {
2391         wxLogTrace(TRACE_STRCONV, wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
2392         return wxCONV_FAILED;
2393     }
2394
2395     return res;
2396 }
2397
2398 size_t wxMBConv_iconv::GetMBNulLen() const
2399 {
2400     if ( m_minMBCharWidth == 0 )
2401     {
2402         wxMBConv_iconv * const self = wxConstCast(this, wxMBConv_iconv);
2403
2404 #if wxUSE_THREADS
2405         // NB: explained in MB2WC
2406         wxMutexLocker lock(self->m_iconvMutex);
2407 #endif
2408
2409         const wchar_t *wnul = L"";
2410         char buf[8]; // should be enough for NUL in any encoding
2411         size_t inLen = sizeof(wchar_t),
2412                outLen = WXSIZEOF(buf);
2413         char *inBuff = (char *)wnul;
2414         char *outBuff = buf;
2415         if ( iconv(w2m, ICONV_CHAR_CAST(&inBuff), &inLen, &outBuff, &outLen) == (size_t)-1 )
2416         {
2417             self->m_minMBCharWidth = (size_t)-1;
2418         }
2419         else // ok
2420         {
2421             self->m_minMBCharWidth = outBuff - buf;
2422         }
2423     }
2424
2425     return m_minMBCharWidth;
2426 }
2427
2428 #if wxUSE_UNICODE_UTF8
2429 bool wxMBConv_iconv::IsUTF8() const
2430 {
2431     return wxStricmp(m_name, "UTF-8") == 0 ||
2432            wxStricmp(m_name, "UTF8") == 0;
2433 }
2434 #endif
2435
2436 #endif // HAVE_ICONV
2437
2438
2439 // ============================================================================
2440 // Win32 conversion classes
2441 // ============================================================================
2442
2443 #ifdef wxHAVE_WIN32_MB2WC
2444
2445 // from utils.cpp
2446 #if wxUSE_FONTMAP
2447 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const char *charset);
2448 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
2449 #endif
2450
2451 class wxMBConv_win32 : public wxMBConv
2452 {
2453 public:
2454     wxMBConv_win32()
2455     {
2456         m_CodePage = CP_ACP;
2457         m_minMBCharWidth = 0;
2458     }
2459
2460     wxMBConv_win32(const wxMBConv_win32& conv)
2461         : wxMBConv()
2462     {
2463         m_CodePage = conv.m_CodePage;
2464         m_minMBCharWidth = conv.m_minMBCharWidth;
2465     }
2466
2467 #if wxUSE_FONTMAP
2468     wxMBConv_win32(const char* name)
2469     {
2470         m_CodePage = wxCharsetToCodepage(name);
2471         m_minMBCharWidth = 0;
2472     }
2473
2474     wxMBConv_win32(wxFontEncoding encoding)
2475     {
2476         m_CodePage = wxEncodingToCodepage(encoding);
2477         m_minMBCharWidth = 0;
2478     }
2479 #endif // wxUSE_FONTMAP
2480
2481     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2482     {
2483         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
2484         // the behaviour is not compatible with the Unix version (using iconv)
2485         // and break the library itself, e.g. wxTextInputStream::NextChar()
2486         // wouldn't work if reading an incomplete MB char didn't result in an
2487         // error
2488         //
2489         // Moreover, MB_ERR_INVALID_CHARS is only supported on Win 2K SP4 or
2490         // Win XP or newer and it is not supported for UTF-[78] so we always
2491         // use our own conversions in this case. See
2492         //     http://blogs.msdn.com/michkap/archive/2005/04/19/409566.aspx
2493         //     http://msdn.microsoft.com/library/en-us/intl/unicode_17si.asp
2494         if ( m_CodePage == CP_UTF8 )
2495         {
2496             return wxMBConvUTF8().MB2WC(buf, psz, n);
2497         }
2498
2499         if ( m_CodePage == CP_UTF7 )
2500         {
2501             return wxMBConvUTF7().MB2WC(buf, psz, n);
2502         }
2503
2504         int flags = 0;
2505         if ( (m_CodePage < 50000 && m_CodePage != CP_SYMBOL) &&
2506                 IsAtLeastWin2kSP4() )
2507         {
2508             flags = MB_ERR_INVALID_CHARS;
2509         }
2510
2511         const size_t len = ::MultiByteToWideChar
2512                              (
2513                                 m_CodePage,     // code page
2514                                 flags,          // flags: fall on error
2515                                 psz,            // input string
2516                                 -1,             // its length (NUL-terminated)
2517                                 buf,            // output string
2518                                 buf ? n : 0     // size of output buffer
2519                              );
2520         if ( !len )
2521         {
2522             // function totally failed
2523             return wxCONV_FAILED;
2524         }
2525
2526         // if we were really converting and didn't use MB_ERR_INVALID_CHARS,
2527         // check if we succeeded, by doing a double trip:
2528         if ( !flags && buf )
2529         {
2530             const size_t mbLen = strlen(psz);
2531             wxCharBuffer mbBuf(mbLen);
2532             if ( ::WideCharToMultiByte
2533                    (
2534                       m_CodePage,
2535                       0,
2536                       buf,
2537                       -1,
2538                       mbBuf.data(),
2539                       mbLen + 1,        // size in bytes, not length
2540                       NULL,
2541                       NULL
2542                    ) == 0 ||
2543                   strcmp(mbBuf, psz) != 0 )
2544             {
2545                 // we didn't obtain the same thing we started from, hence
2546                 // the conversion was lossy and we consider that it failed
2547                 return wxCONV_FAILED;
2548             }
2549         }
2550
2551         // note that it returns count of written chars for buf != NULL and size
2552         // of the needed buffer for buf == NULL so in either case the length of
2553         // the string (which never includes the terminating NUL) is one less
2554         return len - 1;
2555     }
2556
2557     virtual size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
2558     {
2559         /*
2560             we have a problem here: by default, WideCharToMultiByte() may
2561             replace characters unrepresentable in the target code page with bad
2562             quality approximations such as turning "1/2" symbol (U+00BD) into
2563             "1" for the code pages which don't have it and we, obviously, want
2564             to avoid this at any price
2565
2566             the trouble is that this function does it _silently_, i.e. it won't
2567             even tell us whether it did or not... Win98/2000 and higher provide
2568             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
2569             we have to resort to a round trip, i.e. check that converting back
2570             results in the same string -- this is, of course, expensive but
2571             otherwise we simply can't be sure to not garble the data.
2572          */
2573
2574         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
2575         // it doesn't work with CJK encodings (which we test for rather roughly
2576         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
2577         // supporting it
2578         BOOL usedDef wxDUMMY_INITIALIZE(false);
2579         BOOL *pUsedDef;
2580         int flags;
2581         if ( CanUseNoBestFit() && m_CodePage < 50000 )
2582         {
2583             // it's our lucky day
2584             flags = WC_NO_BEST_FIT_CHARS;
2585             pUsedDef = &usedDef;
2586         }
2587         else // old system or unsupported encoding
2588         {
2589             flags = 0;
2590             pUsedDef = NULL;
2591         }
2592
2593         const size_t len = ::WideCharToMultiByte
2594                              (
2595                                 m_CodePage,     // code page
2596                                 flags,          // either none or no best fit
2597                                 pwz,            // input string
2598                                 -1,             // it is (wide) NUL-terminated
2599                                 buf,            // output buffer
2600                                 buf ? n : 0,    // and its size
2601                                 NULL,           // default "replacement" char
2602                                 pUsedDef        // [out] was it used?
2603                              );
2604
2605         if ( !len )
2606         {
2607             // function totally failed
2608             return wxCONV_FAILED;
2609         }
2610
2611         // we did something, check if we really succeeded
2612         if ( flags )
2613         {
2614             // check if the conversion failed, i.e. if any replacements
2615             // were done
2616             if ( usedDef )
2617                 return wxCONV_FAILED;
2618         }
2619         else // we must resort to double tripping...
2620         {
2621             // first we need to ensure that we really have the MB data: this is
2622             // not the case if we're called with NULL buffer, in which case we
2623             // need to do the conversion yet again
2624             wxCharBuffer bufDef;
2625             if ( !buf )
2626             {
2627                 bufDef = wxCharBuffer(len);
2628                 buf = bufDef.data();
2629                 if ( !::WideCharToMultiByte(m_CodePage, flags, pwz, -1,
2630                                             buf, len, NULL, NULL) )
2631                     return wxCONV_FAILED;
2632             }
2633
2634             if ( !n )
2635                 n = wcslen(pwz);
2636             wxWCharBuffer wcBuf(n);
2637             if ( MB2WC(wcBuf.data(), buf, n + 1) == wxCONV_FAILED ||
2638                     wcscmp(wcBuf, pwz) != 0 )
2639             {
2640                 // we didn't obtain the same thing we started from, hence
2641                 // the conversion was lossy and we consider that it failed
2642                 return wxCONV_FAILED;
2643             }
2644         }
2645
2646         // see the comment above for the reason of "len - 1"
2647         return len - 1;
2648     }
2649
2650     virtual size_t GetMBNulLen() const
2651     {
2652         if ( m_minMBCharWidth == 0 )
2653         {
2654             int len = ::WideCharToMultiByte
2655                         (
2656                             m_CodePage,     // code page
2657                             0,              // no flags
2658                             L"",            // input string
2659                             1,              // translate just the NUL
2660                             NULL,           // output buffer
2661                             0,              // and its size
2662                             NULL,           // no replacement char
2663                             NULL            // [out] don't care if it was used
2664                         );
2665
2666             wxMBConv_win32 * const self = wxConstCast(this, wxMBConv_win32);
2667             switch ( len )
2668             {
2669                 default:
2670                     wxLogDebug(_T("Unexpected NUL length %d"), len);
2671                     self->m_minMBCharWidth = (size_t)-1;
2672                     break;
2673
2674                 case 0:
2675                     self->m_minMBCharWidth = (size_t)-1;
2676                     break;
2677
2678                 case 1:
2679                 case 2:
2680                 case 4:
2681                     self->m_minMBCharWidth = len;
2682                     break;
2683             }
2684         }
2685
2686         return m_minMBCharWidth;
2687     }
2688
2689     virtual wxMBConv *Clone() const { return new wxMBConv_win32(*this); }
2690
2691     bool IsOk() const { return m_CodePage != -1; }
2692
2693 private:
2694     static bool CanUseNoBestFit()
2695     {
2696         static int s_isWin98Or2k = -1;
2697
2698         if ( s_isWin98Or2k == -1 )
2699         {
2700             int verMaj, verMin;
2701             switch ( wxGetOsVersion(&verMaj, &verMin) )
2702             {
2703                 case wxOS_WINDOWS_9X:
2704                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
2705                     break;
2706
2707                 case wxOS_WINDOWS_NT:
2708                     s_isWin98Or2k = verMaj >= 5;
2709                     break;
2710
2711                 default:
2712                     // unknown: be conservative by default
2713                     s_isWin98Or2k = 0;
2714                     break;
2715             }
2716
2717             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
2718         }
2719
2720         return s_isWin98Or2k == 1;
2721     }
2722
2723     static bool IsAtLeastWin2kSP4()
2724     {
2725 #ifdef __WXWINCE__
2726         return false;
2727 #else
2728         static int s_isAtLeastWin2kSP4 = -1;
2729
2730         if ( s_isAtLeastWin2kSP4 == -1 )
2731         {
2732             OSVERSIONINFOEX ver;
2733
2734             memset(&ver, 0, sizeof(ver));
2735             ver.dwOSVersionInfoSize = sizeof(ver);
2736             GetVersionEx((OSVERSIONINFO*)&ver);
2737
2738             s_isAtLeastWin2kSP4 =
2739               ((ver.dwMajorVersion > 5) || // Vista+
2740                (ver.dwMajorVersion == 5 && ver.dwMinorVersion > 0) || // XP/2003
2741                (ver.dwMajorVersion == 5 && ver.dwMinorVersion == 0 &&
2742                ver.wServicePackMajor >= 4)) // 2000 SP4+
2743               ? 1 : 0;
2744         }
2745
2746         return s_isAtLeastWin2kSP4 == 1;
2747 #endif
2748     }
2749
2750
2751     // the code page we're working with
2752     long m_CodePage;
2753
2754     // cached result of GetMBNulLen(), set to 0 initially meaning
2755     // "unknown"
2756     size_t m_minMBCharWidth;
2757 };
2758
2759 #endif // wxHAVE_WIN32_MB2WC
2760
2761
2762 // ============================================================================
2763 // wxEncodingConverter based conversion classes
2764 // ============================================================================
2765
2766 #if wxUSE_FONTMAP
2767
2768 class wxMBConv_wxwin : public wxMBConv
2769 {
2770 private:
2771     void Init()
2772     {
2773         // Refuse to use broken wxEncodingConverter code for Mac-specific encodings.
2774         // The wxMBConv_cf class does a better job.
2775         m_ok = (m_enc < wxFONTENCODING_MACMIN || m_enc > wxFONTENCODING_MACMAX) &&
2776                m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2777                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2778     }
2779
2780 public:
2781     // temporarily just use wxEncodingConverter stuff,
2782     // so that it works while a better implementation is built
2783     wxMBConv_wxwin(const char* name)
2784     {
2785         if (name)
2786             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2787         else
2788             m_enc = wxFONTENCODING_SYSTEM;
2789
2790         Init();
2791     }
2792
2793     wxMBConv_wxwin(wxFontEncoding enc)
2794     {
2795         m_enc = enc;
2796
2797         Init();
2798     }
2799
2800     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2801     {
2802         size_t inbuf = strlen(psz);
2803         if (buf)
2804         {
2805             if (!m2w.Convert(psz, buf))
2806                 return wxCONV_FAILED;
2807         }
2808         return inbuf;
2809     }
2810
2811     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2812     {
2813         const size_t inbuf = wxWcslen(psz);
2814         if (buf)
2815         {
2816             if (!w2m.Convert(psz, buf))
2817                 return wxCONV_FAILED;
2818         }
2819
2820         return inbuf;
2821     }
2822
2823     virtual size_t GetMBNulLen() const
2824     {
2825         switch ( m_enc )
2826         {
2827             case wxFONTENCODING_UTF16BE:
2828             case wxFONTENCODING_UTF16LE:
2829                 return 2;
2830
2831             case wxFONTENCODING_UTF32BE:
2832             case wxFONTENCODING_UTF32LE:
2833                 return 4;
2834
2835             default:
2836                 return 1;
2837         }
2838     }
2839
2840     virtual wxMBConv *Clone() const { return new wxMBConv_wxwin(m_enc); }
2841
2842     bool IsOk() const { return m_ok; }
2843
2844 public:
2845     wxFontEncoding m_enc;
2846     wxEncodingConverter m2w, w2m;
2847
2848 private:
2849     // were we initialized successfully?
2850     bool m_ok;
2851
2852     wxDECLARE_NO_COPY_CLASS(wxMBConv_wxwin);
2853 };
2854
2855 // make the constructors available for unit testing
2856 WXDLLIMPEXP_BASE wxMBConv* new_wxMBConv_wxwin( const char* name )
2857 {
2858     wxMBConv_wxwin* result = new wxMBConv_wxwin( name );
2859     if ( !result->IsOk() )
2860     {
2861         delete result;
2862         return 0;
2863     }
2864
2865     return result;
2866 }
2867
2868 #endif // wxUSE_FONTMAP
2869
2870 // ============================================================================
2871 // wxCSConv implementation
2872 // ============================================================================
2873
2874 void wxCSConv::Init()
2875 {
2876     m_name = NULL;
2877     m_convReal =  NULL;
2878     m_deferred = true;
2879 }
2880
2881 wxCSConv::wxCSConv(const wxString& charset)
2882 {
2883     Init();
2884
2885     if ( !charset.empty() )
2886     {
2887         SetName(charset.ToAscii());
2888     }
2889
2890 #if wxUSE_FONTMAP
2891     m_encoding = wxFontMapperBase::GetEncodingFromName(charset);
2892     if ( m_encoding == wxFONTENCODING_MAX )
2893     {
2894         // set to unknown/invalid value
2895         m_encoding = wxFONTENCODING_SYSTEM;
2896     }
2897     else if ( m_encoding == wxFONTENCODING_DEFAULT )
2898     {
2899         // wxFONTENCODING_DEFAULT is same as US-ASCII in this context
2900         m_encoding = wxFONTENCODING_ISO8859_1;
2901     }
2902 #else
2903     m_encoding = wxFONTENCODING_SYSTEM;
2904 #endif
2905 }
2906
2907 wxCSConv::wxCSConv(wxFontEncoding encoding)
2908 {
2909     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2910     {
2911         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2912
2913         encoding = wxFONTENCODING_SYSTEM;
2914     }
2915
2916     Init();
2917
2918     m_encoding = encoding;
2919 }
2920
// Release the charset name and the real conversion object, if any, owned by
// this object.
wxCSConv::~wxCSConv()
{
    Clear();
}
2925
2926 wxCSConv::wxCSConv(const wxCSConv& conv)
2927         : wxMBConv()
2928 {
2929     Init();
2930
2931     SetName(conv.m_name);
2932     m_encoding = conv.m_encoding;
2933 }
2934
2935 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2936 {
2937     Clear();
2938
2939     SetName(conv.m_name);
2940     m_encoding = conv.m_encoding;
2941
2942     return *this;
2943 }
2944
2945 void wxCSConv::Clear()
2946 {
2947     free(m_name);
2948     delete m_convReal;
2949
2950     m_name = NULL;
2951     m_convReal = NULL;
2952 }
2953
2954 void wxCSConv::SetName(const char *charset)
2955 {
2956     if (charset)
2957     {
2958         m_name = wxStrdup(charset);
2959         m_deferred = true;
2960     }
2961 }
2962
2963 #if wxUSE_FONTMAP
2964
// Cache used by wxCSConv::DoCreate() when looking for an iconv conversion:
// it maps an encoding to the charset name for which creating wxMBConv_iconv
// succeeded or to an empty string if none of the names known for this
// encoding worked.
WX_DECLARE_HASH_MAP( wxFontEncoding, wxString, wxIntegerHash, wxIntegerEqual,
                     wxEncodingNameCache );

static wxEncodingNameCache gs_nameCache;
2969 #endif
2970
// Create the real conversion object for our charset name and/or encoding.
//
// Returns a new heap-allocated wxMBConv-derived object (the visible caller,
// CreateConvIfNeeded(), stores it in m_convReal which is deleted by Clear())
// or NULL if either no conversion object is needed (ISO8859-1 and default
// encodings are handled by wxCSConv itself) or none could be created.
//
// Which of the steps below are compiled in depends on HAVE_ICONV,
// wxHAVE_WIN32_MB2WC, __DARWIN__ and wxUSE_FONTMAP.
wxMBConv *wxCSConv::DoCreate() const
{
#if wxUSE_FONTMAP
    wxLogTrace(TRACE_STRCONV,
               wxT("creating conversion for %s"),
               (m_name ? m_name
                       : (const char*)wxFontMapperBase::GetEncodingName(m_encoding).mb_str()));
#endif // wxUSE_FONTMAP

    // check for the special case of ASCII or ISO8859-1 charset: as we have
    // special knowledge of it anyhow, we don't need to create a special
    // conversion object
    if ( m_encoding == wxFONTENCODING_ISO8859_1 ||
            m_encoding == wxFONTENCODING_DEFAULT )
    {
        // don't convert at all
        return NULL;
    }

    // we trust OS to do conversion better than we can so try external
    // conversion methods first
    //
    // the full order is:
    //      1. OS conversion (iconv() under Unix or Win32 API)
    //      2. hard coded conversions for UTF
    //      3. wxEncodingConverter as fall back

    // step (1)
#ifdef HAVE_ICONV
    // without wxUSE_FONTMAP we can only use iconv if we have a charset name
#if !wxUSE_FONTMAP
    if ( m_name )
#endif // !wxUSE_FONTMAP
    {
#if wxUSE_FONTMAP
        wxFontEncoding encoding(m_encoding);
#endif

        if ( m_name )
        {
            // first try to use the name exactly as it was given to us
            wxMBConv_iconv *conv = new wxMBConv_iconv(m_name);
            if ( conv->IsOk() )
                return conv;

            delete conv;

#if wxUSE_FONTMAP
            // it didn't work, map the name to an encoding so that the
            // other names of this encoding could be tried below
            encoding =
                wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
#endif // wxUSE_FONTMAP
        }
#if wxUSE_FONTMAP
        {
            // check if we had already looked for a name working for this
            // encoding
            const wxEncodingNameCache::iterator it = gs_nameCache.find(encoding);
            if ( it != gs_nameCache.end() )
            {
                // empty string means that all names had already failed
                //
                // NOTE(review): returning here also skips the steps (2) and
                // (3) below and doesn't log any error -- confirm that this
                // is intentional
                if ( it->second.empty() )
                    return NULL;

                wxMBConv_iconv *conv = new wxMBConv_iconv(it->second.ToAscii());
                if ( conv->IsOk() )
                    return conv;

                delete conv;
            }

            const wxChar** names = wxFontMapperBase::GetAllEncodingNames(encoding);
            // CS : in case this does not return valid names (eg for MacRoman)
            // encoding got a 'failure' entry in the cache all the same,
            // although it just has to be created using a different method, so
            // only store failed iconv creation attempts (or perhaps we
            // shouldn't do this at all ?)
            if ( names[0] != NULL )
            {
                // try all the names in turn and remember the first one
                // which works
                for ( ; *names; ++names )
                {
                    // FIXME-UTF8: wxFontMapperBase::GetAllEncodingNames()
                    //             will need changes that will obsolete this
                    wxString name(*names);
                    wxMBConv_iconv *conv = new wxMBConv_iconv(name.ToAscii());
                    if ( conv->IsOk() )
                    {
                        gs_nameCache[encoding] = *names;
                        return conv;
                    }

                    delete conv;
                }

                gs_nameCache[encoding] = _T(""); // cache the failure
            }
        }
#endif // wxUSE_FONTMAP
    }
#endif // HAVE_ICONV

#ifdef wxHAVE_WIN32_MB2WC
    {
#if wxUSE_FONTMAP
        // prefer the name, if we have it, to the encoding
        wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
                                      : new wxMBConv_win32(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
#else
        // NOTE(review): in this configuration the function always returns
        // here, the code below is never reached
        return NULL;
#endif
    }
#endif // wxHAVE_WIN32_MB2WC

#ifdef __DARWIN__
    {
        // leave UTF16 and UTF32 to the built-ins of wx
        if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
            ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
        {
#if wxUSE_FONTMAP
            wxMBConv_cf *conv = m_name ? new wxMBConv_cf(m_name)
                                          : new wxMBConv_cf(m_encoding);
#else
            wxMBConv_cf *conv = new wxMBConv_cf(m_encoding);
#endif

            if ( conv->IsOk() )
                 return conv;

            delete conv;
        }
    }
#endif // __DARWIN__

    // step (2)
    wxFontEncoding enc = m_encoding;
#if wxUSE_FONTMAP
    if ( enc == wxFONTENCODING_SYSTEM && m_name )
    {
        // use "false" to suppress interactive dialogs -- we can be called from
        // anywhere and popping up a dialog from here is the last thing we want to
        // do
        enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
    }
#endif // wxUSE_FONTMAP

    // use our own implementations of the UTF conversions
    switch ( enc )
    {
        case wxFONTENCODING_UTF7:
             return new wxMBConvUTF7;

        case wxFONTENCODING_UTF8:
             return new wxMBConvUTF8;

        case wxFONTENCODING_UTF16BE:
             return new wxMBConvUTF16BE;

        case wxFONTENCODING_UTF16LE:
             return new wxMBConvUTF16LE;

        case wxFONTENCODING_UTF32BE:
             return new wxMBConvUTF32BE;

        case wxFONTENCODING_UTF32LE:
             return new wxMBConvUTF32LE;

        default:
             // nothing to do but put here to suppress gcc warnings
             break;
    }

    // step (3)
#if wxUSE_FONTMAP
    {
        wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
                                      : new wxMBConv_wxwin(m_encoding);
        if ( conv->IsOk() )
            return conv;

        delete conv;
    }
#endif // wxUSE_FONTMAP

    // NB: This is a hack to prevent deadlock. What could otherwise happen
    //     in Unicode build: wxConvLocal creation ends up being here
    //     because of some failure and logs the error. But wxLog will try to
    //     attach a timestamp, for which it will need wxConvLocal (to convert
    //     time to char* and then wchar_t*), but that fails, tries to log the
    //     error, but wxLog has an (already locked) critical section that
    //     guards the static buffer.
    static bool alreadyLoggingError = false;
    if (!alreadyLoggingError)
    {
        // the flag is set only for the duration of the wxLogError() call
        alreadyLoggingError = true;
        wxLogError(_("Cannot convert from the charset '%s'!"),
                   m_name ? m_name
                      :
#if wxUSE_FONTMAP
                         (const char*)wxFontMapperBase::GetEncodingDescription(m_encoding).ToAscii()
#else // !wxUSE_FONTMAP
                         (const char*)wxString::Format(_("encoding %i"), m_encoding).ToAscii()
#endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
              );

        alreadyLoggingError = false;
    }

    return NULL;
}
3177
3178 void wxCSConv::CreateConvIfNeeded() const
3179 {
3180     if ( m_deferred )
3181     {
3182         wxCSConv *self = (wxCSConv *)this; // const_cast
3183
3184         // if we don't have neither the name nor the encoding, use the default
3185         // encoding for this system
3186         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
3187         {
3188 #if wxUSE_INTL
3189             self->m_encoding = wxLocale::GetSystemEncoding();
3190 #else
3191             // fallback to some reasonable default:
3192             self->m_encoding = wxFONTENCODING_ISO8859_1;
3193 #endif // wxUSE_INTL
3194         }
3195
3196         self->m_convReal = DoCreate();
3197         self->m_deferred = false;
3198     }
3199 }
3200
3201 bool wxCSConv::IsOk() const
3202 {
3203     CreateConvIfNeeded();
3204
3205     // special case: no convReal created for wxFONTENCODING_ISO8859_1
3206     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
3207         return true; // always ok as we do it ourselves
3208
3209     // m_convReal->IsOk() is called at its own creation, so we know it must
3210     // be ok if m_convReal is non-NULL
3211     return m_convReal != NULL;
3212 }
3213
3214 size_t wxCSConv::ToWChar(wchar_t *dst, size_t dstLen,
3215                          const char *src, size_t srcLen) const
3216 {
3217     CreateConvIfNeeded();
3218
3219     if (m_convReal)
3220         return m_convReal->ToWChar(dst, dstLen, src, srcLen);
3221
3222     // latin-1 (direct)
3223     if ( srcLen == wxNO_LEN )
3224         srcLen = strlen(src) + 1; // take trailing NUL too
3225
3226     if ( dst )
3227     {
3228         if ( dstLen < srcLen )
3229             return wxCONV_FAILED;
3230
3231         for ( size_t n = 0; n < srcLen; n++ )
3232             dst[n] = (unsigned char)(src[n]);
3233     }
3234
3235     return srcLen;
3236 }
3237
3238 size_t wxCSConv::FromWChar(char *dst, size_t dstLen,
3239                            const wchar_t *src, size_t srcLen) const
3240 {
3241     CreateConvIfNeeded();
3242
3243     if (m_convReal)
3244         return m_convReal->FromWChar(dst, dstLen, src, srcLen);
3245
3246     // latin-1 (direct)
3247     if ( srcLen == wxNO_LEN )
3248         srcLen = wxWcslen(src) + 1;
3249
3250     if ( dst )
3251     {
3252         if ( dstLen < srcLen )
3253             return wxCONV_FAILED;
3254
3255         for ( size_t n = 0; n < srcLen; n++ )
3256         {
3257             if ( src[n] > 0xFF )
3258                 return wxCONV_FAILED;
3259
3260             dst[n] = (char)src[n];
3261         }
3262
3263     }
3264     else // still need to check the input validity
3265     {
3266         for ( size_t n = 0; n < srcLen; n++ )
3267         {
3268             if ( src[n] > 0xFF )
3269                 return wxCONV_FAILED;
3270         }
3271     }
3272
3273     return srcLen;
3274 }
3275
3276 size_t wxCSConv::GetMBNulLen() const
3277 {
3278     CreateConvIfNeeded();
3279
3280     if ( m_convReal )
3281     {
3282         return m_convReal->GetMBNulLen();
3283     }
3284
3285     // otherwise, we are ISO-8859-1
3286     return 1;
3287 }
3288
3289 #if wxUSE_UNICODE_UTF8
3290 bool wxCSConv::IsUTF8() const
3291 {
3292     CreateConvIfNeeded();
3293
3294     if ( m_convReal )
3295     {
3296         return m_convReal->IsUTF8();
3297     }
3298
3299     // otherwise, we are ISO-8859-1
3300     return false;
3301 }
3302 #endif
3303
3304
3305 #if wxUSE_UNICODE
3306
3307 wxWCharBuffer wxSafeConvertMB2WX(const char *s)
3308 {
3309     if ( !s )
3310         return wxWCharBuffer();
3311
3312     wxWCharBuffer wbuf(wxConvLibc.cMB2WX(s));
3313     if ( !wbuf )
3314         wbuf = wxMBConvUTF8().cMB2WX(s);
3315     if ( !wbuf )
3316         wbuf = wxConvISO8859_1.cMB2WX(s);
3317
3318     return wbuf;
3319 }
3320
3321 wxCharBuffer wxSafeConvertWX2MB(const wchar_t *ws)
3322 {
3323     if ( !ws )
3324         return wxCharBuffer();
3325
3326     wxCharBuffer buf(wxConvLibc.cWX2MB(ws));
3327     if ( !buf )
3328         buf = wxMBConvUTF8(wxMBConvUTF8::MAP_INVALID_UTF8_TO_OCTAL).cWX2MB(ws);
3329
3330     return buf;
3331 }
3332
3333 #endif // wxUSE_UNICODE
3334
3335 // ----------------------------------------------------------------------------
3336 // globals
3337 // ----------------------------------------------------------------------------
3338
// NB: The reason why we create converter objects in this convoluted way,
3340 //     using a factory function instead of global variable, is that they
3341 //     may be used at static initialization time (some of them are used by
3342 //     wxString ctors and there may be a global wxString object). In other
3343 //     words, possibly _before_ the converter global object would be
3344 //     initialized.
3345
3346 #undef wxConvLibc
3347 #undef wxConvUTF8
3348 #undef wxConvUTF7
3349 #undef wxConvLocal
3350 #undef wxConvISO8859_1
3351
// Define everything needed for the global converter called "name":
//
//  - the pointer variable name##Ptr of type klass*, initially NULL
//  - the exported function wxGet_##name##Ptr() returning the address of its
//    function-local static object of type impl_klass which is constructed
//    using ctor_args
//  - a file-static variable initialized by calling this function
//
// klass is the type used for the pointers while impl_klass is the type of
// the object actually created. The macro expansion doesn't end with a
// semicolon, it must be supplied when using the macro.
#define WX_DEFINE_GLOBAL_CONV2(klass, impl_klass, name, ctor_args)      \
    WXDLLIMPEXP_DATA_BASE(klass*) name##Ptr = NULL;                     \
    WXDLLIMPEXP_BASE klass* wxGet_##name##Ptr()                         \
    {                                                                   \
        static impl_klass name##Obj ctor_args;                          \
        return &name##Obj;                                              \
    }                                                                   \
    /* this ensures that all global converter objects are created */    \
    /* by the time static initialization is done, i.e. before any */    \
    /* thread is launched: */                                           \
    static klass* gs_##name##instance = wxGet_##name##Ptr()
3363
// Simplified version of WX_DEFINE_GLOBAL_CONV2 for the common case when the
// object is of the same class as the one used for the pointers to it.
#define WX_DEFINE_GLOBAL_CONV(klass, name, ctor_args) \
    WX_DEFINE_GLOBAL_CONV2(klass, klass, name, ctor_args)
3366
3367 #ifdef __INTELC__
3368     // disable warning "variable 'xxx' was declared but never referenced"
3369     #pragma warning(disable: 177)
3370 #endif // Intel C++
3371
// wxConvLibc is accessed as wxMBConv but is implemented by wxMBConv_win32
// when __WINDOWS__ is defined and by wxMBConvLibc otherwise, the object being
// created without any constructor arguments in both cases (the branch using
// wxMBConv_cf is disabled by "#elif 0")
#ifdef __WINDOWS__
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_win32, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#elif 0 // defined(__WXOSX__)
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConv_cf, wxConvLibc,  (wxFONTENCODING_UTF8));
#else
    WX_DEFINE_GLOBAL_CONV2(wxMBConv, wxMBConvLibc, wxConvLibc, wxEMPTY_PARAMETER_VALUE);
#endif
3379
3380 // NB: we can't use wxEMPTY_PARAMETER_VALUE as final argument here because it's
3381 //     passed to WX_DEFINE_GLOBAL_CONV2 after a macro expansion and so still
3382 //     provokes an error message about "not enough macro parameters"; and we
3383 //     can't use "()" here as the name##Obj declaration would be parsed as a
3384 //     function declaration then, so use a semicolon and live with an extra
//     empty statement (and hope that no compiler warns about this)
// UTF-8 (strict variant) and UTF-7 converters, created without arguments
WX_DEFINE_GLOBAL_CONV(wxMBConvStrictUTF8, wxConvUTF8, ;);
WX_DEFINE_GLOBAL_CONV(wxMBConvUTF7, wxConvUTF7, ;);

// wxCSConv-based converters for the system encoding and for ISO8859-1
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvLocal, (wxFONTENCODING_SYSTEM));
WX_DEFINE_GLOBAL_CONV(wxCSConv, wxConvISO8859_1, (wxFONTENCODING_ISO8859_1));

// these pointers initially refer to wxConvLibc and wxConvLocal objects
// respectively
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = wxGet_wxConvLibcPtr();
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvUI = wxGet_wxConvLocalPtr();
3394
#ifdef __DARWIN__
// The xnu kernel always communicates file paths in decomposed UTF-8.
// WARNING: Are we sure that CFString's conversion will cause decomposition?
static wxMBConv_cf wxConvMacUTF8DObj(wxFONTENCODING_UTF8);
#endif

// the converter for the file names: it initially points to the UTF-8
// wxMBConv_cf object defined above under Darwin and to wxConvLibc object
// elsewhere
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName =
#ifdef __DARWIN__
                                    &wxConvMacUTF8DObj;
#else // !__DARWIN__
                                    wxGet_wxConvLibcPtr();
#endif // __DARWIN__/!__DARWIN__
3407
3408 #else // !wxUSE_WCHAR_T
3409
// FIXME-UTF8: remove this, wxUSE_WCHAR_T is required now
// stand-ins in absence of wchar_t: notice that, unlike in the wxUSE_WCHAR_T
// case, these are defined directly as objects of the wxMBConv class itself
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
3416
3417 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T