src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
  24   #pragma implementation "strconv.h"
  25 #endif
  26
  27 // For compilers that support precompilation, includes "wx.h".
  28 #include "wx/wxprec.h"
  29
  30 #ifdef __BORLANDC__
  31   #pragma hdrstop
  32 #endif
  33
  34 #ifndef WX_PRECOMP
  35     #include "wx/intl.h"
  36     #include "wx/log.h"
  37 #endif // WX_PRECOMP
  38
  39 #include "wx/strconv.h"
  40
  41 #if wxUSE_WCHAR_T
  42
  43 #ifdef __WXMSW__
  44     #include "wx/msw/private.h"
  45 #endif
  46
  47 #ifdef __WINDOWS__
  48     #include "wx/msw/missing.h"
  49 #endif
  50
  51 #ifndef __WXWINCE__
  52 #include <errno.h>
  53 #endif
  54
  55 #include <ctype.h>
  56 #include <string.h>
  57 #include <stdlib.h>
  58
  59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  60     #define wxHAVE_WIN32_MB2WC
  61 #endif // __WIN32__ but !__WXMICROWIN__
  62
  63 // ----------------------------------------------------------------------------
  64 // headers
  65 // ----------------------------------------------------------------------------
  66
  67 #ifdef __SALFORDC__
  68     #include <clib.h>
  69 #endif
  70
  71 #ifdef HAVE_ICONV
  72     #include <iconv.h>
  73     #include "wx/thread.h"
  74 #endif
  75
  76 #include "wx/encconv.h"
  77 #include "wx/fontmap.h"
  78 #include "wx/utils.h"
  79
  80 #ifdef __WXMAC__
  81 #include <ATSUnicode.h>
  82 #include <TextCommon.h>
  83 #include <TextEncodingConverter.h>
  84
  85 #include  "wx/mac/private.h"  // includes mac headers
  86 #endif
  87 // ----------------------------------------------------------------------------
  88 // macros
  89 // ----------------------------------------------------------------------------
  90
  // Byte-swap, in place, the first `len` elements of the 32 bit (BSWAP_UCS4)
  // or 16 bit (BSWAP_UTF16) array `str`.
  //
  // Each macro expands to a complete braced block, so it is used as a
  // statement of its own.  The arguments are parenthesized so that expressions
  // such as `p + 1` can be passed safely; they are still evaluated more than
  // once, so they must not have side effects.
  #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT32_SWAP_ALWAYS((str)[_c]); }
  #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<(len); _c++) (str)[_c]=wxUINT16_SWAP_ALWAYS((str)[_c]); }
  93
  94 #if SIZEOF_WCHAR_T == 4
  95     #define WC_NAME         "UCS4"
  96     #define WC_BSWAP         BSWAP_UCS4
  97     #ifdef WORDS_BIGENDIAN
  98       #define WC_NAME_BEST  "UCS-4BE"
  99     #else
 100       #define WC_NAME_BEST  "UCS-4LE"
 101     #endif
 102 #elif SIZEOF_WCHAR_T == 2
 103     #define WC_NAME         "UTF16"
 104     #define WC_BSWAP         BSWAP_UTF16
 105     #define WC_UTF16
 106     #ifdef WORDS_BIGENDIAN
 107       #define WC_NAME_BEST  "UTF-16BE"
 108     #else
 109       #define WC_NAME_BEST  "UTF-16LE"
 110     #endif
 111 #else // sizeof(wchar_t) != 2 nor 4
 112     // does this ever happen?
 113     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
 114 #endif
 115
 116 // ============================================================================
 117 // implementation
 118 // ============================================================================
 119
 120 // ----------------------------------------------------------------------------
 121 // UTF-16 en/decoding to/from UCS-4
 122 // ----------------------------------------------------------------------------
 123
 124
 125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 126 {
 127     if (input<=0xffff)
 128     {
 129         if (output)
 130             *output = (wxUint16) input;
 131         return 1;
 132     }
 133     else if (input>=0x110000)
 134     {
 135         return (size_t)-1;
 136     }
 137     else
 138     {
 139         if (output)
 140         {
 141             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 142             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 143         }
 144         return 2;
 145     }
 146 }
 147
 148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 149 {
 150     if ((*input<0xd800) || (*input>0xdfff))
 151     {
 152         output = *input;
 153         return 1;
 154     }
 155     else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
 156     {
 157         output = *input;
 158         return (size_t)-1;
 159     }
 160     else
 161     {
 162         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 163         return 2;
 164     }
 165 }
 166
 167
 168 // ----------------------------------------------------------------------------
 169 // wxMBConv
 170 // ----------------------------------------------------------------------------
 171
 172 wxMBConv::~wxMBConv()
 173 {
 174     // nothing to do here (necessary for Darwin linking probably)
 175 }
 176
 177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 178 {
 179     if ( psz )
 180     {
 181         // calculate the length of the buffer needed first
 182         size_t nLen = MB2WC(NULL, psz, 0);
 183         if ( nLen != (size_t)-1 )
 184         {
 185             // now do the actual conversion
 186             wxWCharBuffer buf(nLen);
 187             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 188             if ( nLen != (size_t)-1 )
 189             {
 190                 return buf;
 191             }
 192         }
 193     }
 194
 195     wxWCharBuffer buf((wchar_t *)NULL);
 196
 197     return buf;
 198 }
 199
 200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 201 {
 202     if ( pwz )
 203     {
 204         size_t nLen = WC2MB(NULL, pwz, 0);
 205         if ( nLen != (size_t)-1 )
 206         {
 207             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 208             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 209             if ( nLen != (size_t)-1 )
 210             {
 211                 return buf;
 212             }
 213         }
 214     }
 215
 216     wxCharBuffer buf((char *)NULL);
 217
 218     return buf;
 219 }
 220
 221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
 222 {
 223     wxASSERT(pOutSize != NULL);
 224
 225     const char* szEnd = szString + nStringLen + 1;
 226     const char* szPos = szString;
 227     const char* szStart = szPos;
 228
 229     size_t nActualLength = 0;
 230     size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
 231
 232     wxWCharBuffer theBuffer(nCurrentSize);
 233
 234     //Convert the string until the length() is reached, continuing the
 235     //loop every time a null character is reached
 236     while(szPos != szEnd)
 237     {
 238         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 239
 240         //Get the length of the current (sub)string
 241         size_t nLen = MB2WC(NULL, szPos, 0);
 242
 243         //Invalid conversion?
 244         if( nLen == (size_t)-1 )
 245         {
 246             *pOutSize = 0;
 247             theBuffer.data()[0u] = wxT('\0');
 248             return theBuffer;
 249         }
 250
 251
 252         //Increase the actual length (+1 for current null character)
 253         nActualLength += nLen + 1;
 254
 255         //if buffer too big, realloc the buffer
 256         if (nActualLength > (nCurrentSize+1))
 257         {
 258             wxWCharBuffer theNewBuffer(nCurrentSize << 1);
 259             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
 260             theBuffer = theNewBuffer;
 261             nCurrentSize <<= 1;
 262         }
 263
 264         //Convert the current (sub)string
 265         if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 266         {
 267             *pOutSize = 0;
 268             theBuffer.data()[0u] = wxT('\0');
 269             return theBuffer;
 270         }
 271
 272         //Increment to next (sub)string
 273         //Note that we have to use strlen here instead of nLen
 274         //here because XX2XX gives us the size of the output buffer,
 275         //not neccessarly the length of the string
 276         szPos += strlen(szPos) + 1;
 277     }
 278
 279     //success - return actual length and the buffer
 280     *pOutSize = nActualLength;
 281     return theBuffer;
 282 }
 283
 284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
 285 {
 286     wxASSERT(pOutSize != NULL);
 287
 288     const wchar_t* szEnd = szString + nStringLen + 1;
 289     const wchar_t* szPos = szString;
 290     const wchar_t* szStart = szPos;
 291
 292     size_t nActualLength = 0;
 293     size_t nCurrentSize = nStringLen << 2; //try * 4 first
 294
 295     wxCharBuffer theBuffer(nCurrentSize);
 296
 297     //Convert the string until the length() is reached, continuing the
 298     //loop every time a null character is reached
 299     while(szPos != szEnd)
 300     {
 301         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 302
 303         //Get the length of the current (sub)string
 304         size_t nLen = WC2MB(NULL, szPos, 0);
 305
 306         //Invalid conversion?
 307         if( nLen == (size_t)-1 )
 308         {
 309             *pOutSize = 0;
 310             theBuffer.data()[0u] = wxT('\0');
 311             return theBuffer;
 312         }
 313
 314         //Increase the actual length (+1 for current null character)
 315         nActualLength += nLen + 1;
 316
 317         //if buffer too big, realloc the buffer
 318         if (nActualLength > (nCurrentSize+1))
 319         {
 320             wxCharBuffer theNewBuffer(nCurrentSize << 1);
 321             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
 322             theBuffer = theNewBuffer;
 323             nCurrentSize <<= 1;
 324         }
 325
 326         //Convert the current (sub)string
 327         if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 328         {
 329             *pOutSize = 0;
 330             theBuffer.data()[0u] = wxT('\0');
 331             return theBuffer;
 332         }
 333
 334         //Increment to next (sub)string
 335         //Note that we have to use wxWcslen here instead of nLen
 336         //here because XX2XX gives us the size of the output buffer,
 337         //not neccessarly the length of the string
 338         szPos += wxWcslen(szPos) + 1;
 339     }
 340
 341     //success - return actual length and the buffer
 342     *pOutSize = nActualLength;
 343     return theBuffer;
 344 }
 345
 346 // ----------------------------------------------------------------------------
 347 // wxMBConvLibc
 348 // ----------------------------------------------------------------------------
 349
 350 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 351 {
 352     return wxMB2WC(buf, psz, n);
 353 }
 354
 355 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 356 {
 357     return wxWC2MB(buf, psz, n);
 358 }
 359
 360 // ----------------------------------------------------------------------------
 361 // wxConvBrokenFileNames is made for GTK2 in Unicode mode when
 362 // files are accidentally written in an encoding which is not
 363 // the system encoding. Typically, the system encoding will be
 364 // UTF8 but there might be files stored in ISO8859-1 in disk.
 365 // ----------------------------------------------------------------------------
 366
 367 class wxConvBrokenFileNames: public wxMBConvLibc
 368 {
 369 public:
 370     virtual size_t MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const;
 371     virtual size_t WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const;
 372 };
 373
 374 size_t wxConvBrokenFileNames::MB2WC(wchar_t *outputBuf, const char *psz, size_t outputSize) const
 375 {
 376 #if 0
 377     if (we find some invalid characters)
 378     {
 379        Convert to Unicode range.
 380     }
 381     else
 382 #endif
 383     return wxMBConvLibc::MB2WC( outputBuf, psz, outputSize );
 384 }
 385
 386 size_t wxConvBrokenFileNames::WC2MB(char *outputBuf, const wchar_t *psz, size_t outputSize) const
 387 {
 388 #if 0
 389     Convert back from Unicode range.
 390 #endif
 391     return wxMBConvLibc::WC2MB( outputBuf, psz, outputSize );
 392 }
 393
 394 // ----------------------------------------------------------------------------
 395 // UTF-7
 396 // ----------------------------------------------------------------------------
 397
 398 // Implementation (C) 2004 Fredrik Roubert
 399
 400 //
 401 // BASE64 decoding table
 402 //
 // BASE64 decoding table, indexed by byte value: the entry is the 6 bit value
 // of a BASE64 digit ('A'-'Z', 'a'-'z', '0'-'9', '+', '/') and 0xff for every
 // other byte.  One row per 16 byte values.
 static const unsigned char utf7unb64[] =
 {
     /* 0x00 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x10 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x20 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
     /* 0x30 */ 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x40 */ 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
     /* 0x50 */ 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x60 */ 0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
     /* 0x70 */ 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x80 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0x90 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xa0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xb0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xc0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xd0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xe0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     /* 0xf0 */ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 };
 438
 439 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 440 {
 441     size_t len = 0;
 442
 443     while (*psz && ((!buf) || (len < n)))
 444     {
 445         unsigned char cc = *psz++;
 446         if (cc != '+')
 447         {
 448             // plain ASCII char
 449             if (buf)
 450                 *buf++ = cc;
 451             len++;
 452         }
 453         else if (*psz == '-')
 454         {
 455             // encoded plus sign
 456             if (buf)
 457                 *buf++ = cc;
 458             len++;
 459             psz++;
 460         }
 461         else
 462         {
 463             // BASE64 encoded string
 464             bool lsb;
 465             unsigned char c;
 466             unsigned int d, l;
 467             for (lsb = false, d = 0, l = 0;
 468                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
 469             {
 470                 d <<= 6;
 471                 d += cc;
 472                 for (l += 6; l >= 8; lsb = !lsb)
 473                 {
 474                     c = (unsigned char)((d >> (l -= 8)) % 256);
 475                     if (lsb)
 476                     {
 477                         if (buf)
 478                             *buf++ |= c;
 479                         len ++;
 480                     }
 481                     else
 482                         if (buf)
 483                             *buf = (wchar_t)(c << 8);
 484                 }
 485             }
 486             if (*psz == '-')
 487                 psz++;
 488         }
 489     }
 490     if (buf && (len < n))
 491         *buf = 0;
 492     return len;
 493 }
 494
 495 //
 496 // BASE64 encoding table
 497 //
 // BASE64 encoding table: entry i is the character standing for the 6 bit
 // value i.
 static const unsigned char utf7enb64[] =
 {
     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
     'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
 };
 509
 510 //
 511 // UTF-7 encoding table
 512 //
 513 // 0 - Set D (directly encoded characters)
 514 // 1 - Set O (optional direct characters)
 515 // 2 - whitespace characters (optional)
 516 // 3 - special characters
 517 //
 518 static const unsigned char utf7encode[128] =
 519 {
 520     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 521     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 522     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 523     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 524     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 525     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 526     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 527     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 528 };
 529
 530 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 531 {
 532
 533
 534     size_t len = 0;
 535
 536     while (*psz && ((!buf) || (len < n)))
 537     {
 538         wchar_t cc = *psz++;
 539         if (cc < 0x80 && utf7encode[cc] < 1)
 540         {
 541             // plain ASCII char
 542             if (buf)
 543                 *buf++ = (char)cc;
 544             len++;
 545         }
 546 #ifndef WC_UTF16
 547         else if (((wxUint32)cc) > 0xffff)
 548             {
 549             // no surrogate pair generation (yet?)
 550             return (size_t)-1;
 551         }
 552 #endif
 553         else
 554         {
 555             if (buf)
 556                 *buf++ = '+';
 557             len++;
 558             if (cc != '+')
 559             {
 560                 // BASE64 encode string
 561                 unsigned int lsb, d, l;
 562                 for (d = 0, l = 0;; psz++)
 563                 {
 564                     for (lsb = 0; lsb < 2; lsb ++)
 565                     {
 566                         d <<= 8;
 567                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 568
 569                         for (l += 8; l >= 6; )
 570                         {
 571                             l -= 6;
 572                             if (buf)
 573                                 *buf++ = utf7enb64[(d >> l) % 64];
 574                             len++;
 575                         }
 576                     }
 577                     cc = *psz;
 578                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 579                         break;
 580                 }
 581                 if (l != 0)
 582                 {
 583                     if (buf)
 584                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 585                     len++;
 586                 }
 587             }
 588             if (buf)
 589                 *buf++ = '-';
 590             len++;
 591         }
 592     }
 593     if (buf && (len < n))
 594         *buf = 0;
 595     return len;
 596 }
 597
 598 // ----------------------------------------------------------------------------
 599 // UTF-8
 600 // ----------------------------------------------------------------------------
 601
 602 static wxUint32 utf8_max[]=
 603     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 604
 605 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 606 {
 607     size_t len = 0;
 608
 609     while (*psz && ((!buf) || (len < n)))
 610     {
 611         unsigned char cc = *psz++, fc = cc;
 612         unsigned cnt;
 613         for (cnt = 0; fc & 0x80; cnt++)
 614             fc <<= 1;
 615         if (!cnt)
 616         {
 617             // plain ASCII char
 618             if (buf)
 619                 *buf++ = cc;
 620             len++;
 621         }
 622         else
 623         {
 624             cnt--;
 625             if (!cnt)
 626             {
 627                 // invalid UTF-8 sequence
 628                 return (size_t)-1;
 629             }
 630             else
 631             {
 632                 unsigned ocnt = cnt - 1;
 633                 wxUint32 res = cc & (0x3f >> cnt);
 634                 while (cnt--)
 635                 {
 636                     cc = *psz++;
 637                     if ((cc & 0xC0) != 0x80)
 638                     {
 639                         // invalid UTF-8 sequence
 640                         return (size_t)-1;
 641                     }
 642                     res = (res << 6) | (cc & 0x3f);
 643                 }
 644                 if (res <= utf8_max[ocnt])
 645                 {
 646                     // illegal UTF-8 encoding
 647                     return (size_t)-1;
 648                 }
 649 #ifdef WC_UTF16
 650                 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 651                 size_t pa = encode_utf16(res, (wxUint16 *)buf);
 652                 if (pa == (size_t)-1)
 653                   return (size_t)-1;
 654                 if (buf)
 655                     buf += pa;
 656                 len += pa;
 657 #else // !WC_UTF16
 658                 if (buf)
 659                     *buf++ = res;
 660                 len++;
 661 #endif // WC_UTF16/!WC_UTF16
 662             }
 663         }
 664     }
 665     if (buf && (len < n))
 666         *buf = 0;
 667     return len;
 668 }
 669
 670 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 671 {
 672     size_t len = 0;
 673
 674     while (*psz && ((!buf) || (len < n)))
 675     {
 676         wxUint32 cc;
 677 #ifdef WC_UTF16
 678         // cast is ok for WC_UTF16
 679         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 680         psz += (pa == (size_t)-1) ? 1 : pa;
 681 #else
 682         cc=(*psz++) & 0x7fffffff;
 683 #endif
 684         unsigned cnt;
 685         for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 686         if (!cnt)
 687         {
 688             // plain ASCII char
 689             if (buf)
 690                 *buf++ = (char) cc;
 691             len++;
 692         }
 693
 694         else
 695         {
 696             len += cnt + 1;
 697             if (buf)
 698             {
 699                 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 700                 while (cnt--)
 701                     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 702             }
 703         }
 704     }
 705
 706     if (buf && (len<n)) *buf = 0;
 707
 708     return len;
 709 }
 710
 711
 712
 713
 714 // ----------------------------------------------------------------------------
 715 // UTF-16
 716 // ----------------------------------------------------------------------------
 717
 // "straight" is the UTF-16 converter whose byte order is the native one and
 // "swap" the one using the opposite byte order.
 #ifndef WORDS_BIGENDIAN
     #define wxMBConvUTF16straight wxMBConvUTF16LE
     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 #else // big endian machine
     #define wxMBConvUTF16straight wxMBConvUTF16BE
     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 #endif
 725
 726
 727 #ifdef WC_UTF16
 728
 729 // copy 16bit MB to 16bit String
 730 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 731 {
 732     size_t len=0;
 733
 734     while (*(wxUint16*)psz && (!buf || len < n))
 735     {
 736         if (buf)
 737             *buf++ = *(wxUint16*)psz;
 738         len++;
 739
 740         psz += sizeof(wxUint16);
 741     }
 742     if (buf && len<n)   *buf=0;
 743
 744     return len;
 745 }
 746
 747
 748 // copy 16bit String to 16bit MB
 749 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 750 {
 751     size_t len=0;
 752
 753     while (*psz && (!buf || len < n))
 754     {
 755         if (buf)
 756         {
 757             *(wxUint16*)buf = *psz;
 758             buf += sizeof(wxUint16);
 759         }
 760         len += sizeof(wxUint16);
 761         psz++;
 762     }
 763     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 764
 765     return len;
 766 }
 767
 768
 769 // swap 16bit MB to 16bit String
 770 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 771 {
 772     size_t len=0;
 773
 774     while (*(wxUint16*)psz && (!buf || len < n))
 775     {
 776         if (buf)
 777         {
 778             ((char *)buf)[0] = psz[1];
 779             ((char *)buf)[1] = psz[0];
 780             buf++;
 781         }
 782         len++;
 783         psz += sizeof(wxUint16);
 784     }
 785     if (buf && len<n)   *buf=0;
 786
 787     return len;
 788 }
 789
 790
 791 // swap 16bit MB to 16bit String
 792 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 793 {
 794     size_t len=0;
 795
 796     while (*psz && (!buf || len < n))
 797     {
 798         if (buf)
 799         {
 800             *buf++ = ((char*)psz)[1];
 801             *buf++ = ((char*)psz)[0];
 802         }
 803         len += sizeof(wxUint16);
 804         psz++;
 805     }
 806     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 807
 808     return len;
 809 }
 810
 811
 812 #else // WC_UTF16
 813
 814
 815 // copy 16bit MB to 32bit String
 816 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 817 {
 818     size_t len=0;
 819
 820     while (*(wxUint16*)psz && (!buf || len < n))
 821     {
 822         wxUint32 cc;
 823         size_t pa=decode_utf16((wxUint16*)psz, cc);
 824         if (pa == (size_t)-1)
 825             return pa;
 826
 827         if (buf)
 828             *buf++ = cc;
 829         len++;
 830         psz += pa * sizeof(wxUint16);
 831     }
 832     if (buf && len<n)   *buf=0;
 833
 834     return len;
 835 }
 836
 837
 838 // copy 32bit String to 16bit MB
 839 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 840 {
 841     size_t len=0;
 842
 843     while (*psz && (!buf || len < n))
 844     {
 845         wxUint16 cc[2];
 846         size_t pa=encode_utf16(*psz, cc);
 847
 848         if (pa == (size_t)-1)
 849             return pa;
 850
 851         if (buf)
 852         {
 853             *(wxUint16*)buf = cc[0];
 854             buf += sizeof(wxUint16);
 855             if (pa > 1)
 856             {
 857                 *(wxUint16*)buf = cc[1];
 858                 buf += sizeof(wxUint16);
 859             }
 860         }
 861
 862         len += pa*sizeof(wxUint16);
 863         psz++;
 864     }
 865     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 866
 867     return len;
 868 }
 869
 870
 871 // swap 16bit MB to 32bit String
 872 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 873 {
 874     size_t len=0;
 875
 876     while (*(wxUint16*)psz && (!buf || len < n))
 877     {
 878         wxUint32 cc;
 879         char tmp[4];
 880         tmp[0]=psz[1];  tmp[1]=psz[0];
 881         tmp[2]=psz[3];  tmp[3]=psz[2];
 882
 883         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 884         if (pa == (size_t)-1)
 885             return pa;
 886
 887         if (buf)
 888             *buf++ = cc;
 889
 890         len++;
 891         psz += pa * sizeof(wxUint16);
 892     }
 893     if (buf && len<n)   *buf=0;
 894
 895     return len;
 896 }
 897
 898
 899 // swap 32bit String to 16bit MB
 900 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 901 {
 902     size_t len=0;
 903
 904     while (*psz && (!buf || len < n))
 905     {
 906         wxUint16 cc[2];
 907         size_t pa=encode_utf16(*psz, cc);
 908
 909         if (pa == (size_t)-1)
 910             return pa;
 911
 912         if (buf)
 913         {
 914             *buf++ = ((char*)cc)[1];
 915             *buf++ = ((char*)cc)[0];
 916             if (pa > 1)
 917             {
 918                 *buf++ = ((char*)cc)[3];
 919                 *buf++ = ((char*)cc)[2];
 920             }
 921         }
 922
 923         len += pa*sizeof(wxUint16);
 924         psz++;
 925     }
 926     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 927
 928     return len;
 929 }
 930
 931 #endif // WC_UTF16
 932
 933
 934 // ----------------------------------------------------------------------------
 935 // UTF-32
 936 // ----------------------------------------------------------------------------
 937
 938 #ifdef WORDS_BIGENDIAN
 939 #define wxMBConvUTF32straight  wxMBConvUTF32BE
 940 #define wxMBConvUTF32swap      wxMBConvUTF32LE
 941 #else
 942 #define wxMBConvUTF32swap      wxMBConvUTF32BE
 943 #define wxMBConvUTF32straight  wxMBConvUTF32LE
 944 #endif
 945
 946
 947 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
 948 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
 949
 950
 951 #ifdef WC_UTF16
 952
 953 // copy 32bit MB to 16bit String
 954 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 955 {
 956     size_t len=0;
 957
 958     while (*(wxUint32*)psz && (!buf || len < n))
 959     {
 960         wxUint16 cc[2];
 961
 962         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
 963         if (pa == (size_t)-1)
 964             return pa;
 965
 966         if (buf)
 967         {
 968             *buf++ = cc[0];
 969             if (pa > 1)
 970                 *buf++ = cc[1];
 971         }
 972         len += pa;
 973         psz += sizeof(wxUint32);
 974     }
 975     if (buf && len<n)   *buf=0;
 976
 977     return len;
 978 }
 979
 980
 981 // copy 16bit String to 32bit MB
 982 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 983 {
 984     size_t len=0;
 985
 986     while (*psz && (!buf || len < n))
 987     {
 988         wxUint32 cc;
 989
 990         // cast is ok for WC_UTF16
 991         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 992         if (pa == (size_t)-1)
 993             return pa;
 994
 995         if (buf)
 996         {
 997             *(wxUint32*)buf = cc;
 998             buf += sizeof(wxUint32);
 999         }
1000         len += sizeof(wxUint32);
1001         psz += pa;
1002     }
1003
1004     if (buf && len<=n-sizeof(wxUint32))
1005         *(wxUint32*)buf=0;
1006
1007     return len;
1008 }
1009
1010
1011
1012 // swap 32bit MB to 16bit String
1013 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1014 {
1015     size_t len=0;
1016
1017     while (*(wxUint32*)psz && (!buf || len < n))
1018     {
1019         char tmp[4];
1020         tmp[0] = psz[3];   tmp[1] = psz[2];
1021         tmp[2] = psz[1];   tmp[3] = psz[0];
1022
1023
1024         wxUint16 cc[2];
1025
1026         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
1027         if (pa == (size_t)-1)
1028             return pa;
1029
1030         if (buf)
1031         {
1032             *buf++ = cc[0];
1033             if (pa > 1)
1034                 *buf++ = cc[1];
1035         }
1036         len += pa;
1037         psz += sizeof(wxUint32);
1038     }
1039
1040     if (buf && len<n)
1041         *buf=0;
1042
1043     return len;
1044 }
1045
1046
1047 // swap 16bit String to 32bit MB
1048 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1049 {
1050     size_t len=0;
1051
1052     while (*psz && (!buf || len < n))
1053     {
1054         char cc[4];
1055
1056         // cast is ok for WC_UTF16
1057         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1058         if (pa == (size_t)-1)
1059             return pa;
1060
1061         if (buf)
1062         {
1063             *buf++ = cc[3];
1064             *buf++ = cc[2];
1065             *buf++ = cc[1];
1066             *buf++ = cc[0];
1067         }
1068         len += sizeof(wxUint32);
1069         psz += pa;
1070     }
1071
1072     if (buf && len<=n-sizeof(wxUint32))
1073         *(wxUint32*)buf=0;
1074
1075     return len;
1076 }
1077
1078 #else // WC_UTF16
1079
1080
1081 // copy 32bit MB to 32bit String
1082 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1083 {
1084     size_t len=0;
1085
1086     while (*(wxUint32*)psz && (!buf || len < n))
1087     {
1088         if (buf)
1089             *buf++ = *(wxUint32*)psz;
1090         len++;
1091         psz += sizeof(wxUint32);
1092     }
1093
1094     if (buf && len<n)
1095         *buf=0;
1096
1097     return len;
1098 }
1099
1100
1101 // copy 32bit String to 32bit MB
1102 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1103 {
1104     size_t len=0;
1105
1106     while (*psz && (!buf || len < n))
1107     {
1108         if (buf)
1109         {
1110             *(wxUint32*)buf = *psz;
1111             buf += sizeof(wxUint32);
1112         }
1113
1114         len += sizeof(wxUint32);
1115         psz++;
1116     }
1117
1118     if (buf && len<=n-sizeof(wxUint32))
1119         *(wxUint32*)buf=0;
1120
1121     return len;
1122 }
1123
1124
1125 // swap 32bit MB to 32bit String
1126 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1127 {
1128     size_t len=0;
1129
1130     while (*(wxUint32*)psz && (!buf || len < n))
1131     {
1132         if (buf)
1133         {
1134             ((char *)buf)[0] = psz[3];
1135             ((char *)buf)[1] = psz[2];
1136             ((char *)buf)[2] = psz[1];
1137             ((char *)buf)[3] = psz[0];
1138             buf++;
1139         }
1140         len++;
1141         psz += sizeof(wxUint32);
1142     }
1143
1144     if (buf && len<n)
1145         *buf=0;
1146
1147     return len;
1148 }
1149
1150
1151 // swap 32bit String to 32bit MB
1152 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1153 {
1154     size_t len=0;
1155
1156     while (*psz && (!buf || len < n))
1157     {
1158         if (buf)
1159         {
1160             *buf++ = ((char *)psz)[3];
1161             *buf++ = ((char *)psz)[2];
1162             *buf++ = ((char *)psz)[1];
1163             *buf++ = ((char *)psz)[0];
1164         }
1165         len += sizeof(wxUint32);
1166         psz++;
1167     }
1168
1169     if (buf && len<=n-sizeof(wxUint32))
1170         *(wxUint32*)buf=0;
1171
1172     return len;
1173 }
1174
1175
1176 #endif // WC_UTF16
1177
1178
1179 // ============================================================================
1180 // The classes doing conversion using the iconv_xxx() functions
1181 // ============================================================================
1182
1183 #ifdef HAVE_ICONV
1184
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted. Note that the
// arguments are not parenthesized in the expansion, so only simple
// expressions should be passed to it.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
// glibc <= 2.1: E2BIG with nothing left in the input is not a real failure
#define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
                                     (errno != E2BIG || bufLeft != 0))
#else
// elsewhere any (size_t)-1 result is a failure (bufLeft is unused)
#define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
#endif

// cast the address of the input pointer to the type used for the second
// argument of iconv() calls; ICONV_CONST is defined outside of this part of
// the file (presumably by the build configuration, as either "const" or
// nothing depending on the iconv() prototype -- TODO confirm)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1201
1202 // ----------------------------------------------------------------------------
1203 // wxMBConv_iconv: encapsulates an iconv character set
1204 // ----------------------------------------------------------------------------
1205
// wxMBConv implementation holding one iconv descriptor for each conversion
// direction between the charset given to the constructor and wchar_t
class wxMBConv_iconv : public wxMBConv
{
public:
    // name is the charset name, it is passed (after conversion to char) to
    // iconv_open()
    wxMBConv_iconv(const wxChar *name);
    virtual ~wxMBConv_iconv();

    // both return (size_t)-1 if iconv() fails; with a NULL buf they only
    // compute the size of the result
    virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
    virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;

    // the object is usable only if both descriptors could be opened
    bool IsOk() const
        { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }

protected:
    // the iconv handlers used to translate from multibyte to wide char and in
    // the other direction
    iconv_t m2w,
            w2m;
#if wxUSE_THREADS
    // guards access to m2w and w2m objects
    wxMutex m_iconvMutex;
#endif

private:
    // the name (for iconv_open()) of a wide char charset -- if none is
    // available on this machine, it will remain NULL
    static const char *ms_wcCharsetName;

    // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
    // different endian-ness than the native one
    static bool ms_wcNeedsSwap;
};

// both are (re)computed by the constructor for as long as ms_wcCharsetName is
// still NULL
const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1240
1241 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1242 {
1243     // Do it the hard way
1244     char cname[100];
1245     for (size_t i = 0; i < wxStrlen(name)+1; i++)
1246         cname[i] = (char) name[i];
1247
1248     // check for charset that represents wchar_t:
1249     if (ms_wcCharsetName == NULL)
1250     {
1251         ms_wcNeedsSwap = false;
1252
1253         // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1254         ms_wcCharsetName = WC_NAME_BEST;
1255         m2w = iconv_open(ms_wcCharsetName, cname);
1256
1257         if (m2w == (iconv_t)-1)
1258         {
1259             // try charset w/o bytesex info (e.g. "UCS4")
1260             // and check for bytesex ourselves:
1261             ms_wcCharsetName = WC_NAME;
1262             m2w = iconv_open(ms_wcCharsetName, cname);
1263
1264             // last bet, try if it knows WCHAR_T pseudo-charset
1265             if (m2w == (iconv_t)-1)
1266             {
1267                 ms_wcCharsetName = "WCHAR_T";
1268                 m2w = iconv_open(ms_wcCharsetName, cname);
1269             }
1270
1271             if (m2w != (iconv_t)-1)
1272             {
1273                 char    buf[2], *bufPtr;
1274                 wchar_t wbuf[2], *wbufPtr;
1275                 size_t  insz, outsz;
1276                 size_t  res;
1277
1278                 buf[0] = 'A';
1279                 buf[1] = 0;
1280                 wbuf[0] = 0;
1281                 insz = 2;
1282                 outsz = SIZEOF_WCHAR_T * 2;
1283                 wbufPtr = wbuf;
1284                 bufPtr = buf;
1285
1286                 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1287                             (char**)&wbufPtr, &outsz);
1288
1289                 if (ICONV_FAILED(res, insz))
1290                 {
1291                     ms_wcCharsetName = NULL;
1292                     wxLogLastError(wxT("iconv"));
1293                     wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1294                 }
1295                 else
1296                 {
1297                     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1298                 }
1299             }
1300             else
1301             {
1302                 ms_wcCharsetName = NULL;
1303
1304                 // VS: we must not output an error here, since wxWidgets will safely
1305                 //     fall back to using wxEncodingConverter.
1306                 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1307                 //wxLogError(
1308             }
1309         }
1310         wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1311     }
1312     else // we already have ms_wcCharsetName
1313     {
1314         m2w = iconv_open(ms_wcCharsetName, cname);
1315     }
1316
1317     // NB: don't ever pass NULL to iconv_open(), it may crash!
1318     if ( ms_wcCharsetName )
1319     {
1320         w2m = iconv_open( cname, ms_wcCharsetName);
1321     }
1322     else
1323     {
1324         w2m = (iconv_t)-1;
1325     }
1326 }
1327
1328 wxMBConv_iconv::~wxMBConv_iconv()
1329 {
1330     if ( m2w != (iconv_t)-1 )
1331         iconv_close(m2w);
1332     if ( w2m != (iconv_t)-1 )
1333         iconv_close(w2m);
1334 }
1335
1336 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1337 {
1338 #if wxUSE_THREADS
1339     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1340     //     Unfortunately there is a couple of global wxCSConv objects such as
1341     //     wxConvLocal that are used all over wx code, so we have to make sure
1342     //     the handle is used by at most one thread at the time. Otherwise
1343     //     only a few wx classes would be safe to use from non-main threads
1344     //     as MB<->WC conversion would fail "randomly".
1345     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1346 #endif
1347
1348     size_t inbuf = strlen(psz);
1349     size_t outbuf = n * SIZEOF_WCHAR_T;
1350     size_t res, cres;
1351     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1352     wchar_t *bufPtr = buf;
1353     const char *pszPtr = psz;
1354
1355     if (buf)
1356     {
1357         // have destination buffer, convert there
1358         cres = iconv(m2w,
1359                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1360                      (char**)&bufPtr, &outbuf);
1361         res = n - (outbuf / SIZEOF_WCHAR_T);
1362
1363         if (ms_wcNeedsSwap)
1364         {
1365             // convert to native endianness
1366             WC_BSWAP(buf /* _not_ bufPtr */, res)
1367         }
1368
1369         // NB: iconv was given only strlen(psz) characters on input, and so
1370         //     it couldn't convert the trailing zero. Let's do it ourselves
1371         //     if there's some room left for it in the output buffer.
1372         if (res < n)
1373             buf[res] = 0;
1374     }
1375     else
1376     {
1377         // no destination buffer... convert using temp buffer
1378         // to calculate destination buffer requirement
1379         wchar_t tbuf[8];
1380         res = 0;
1381         do {
1382             bufPtr = tbuf;
1383             outbuf = 8*SIZEOF_WCHAR_T;
1384
1385             cres = iconv(m2w,
1386                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1387                          (char**)&bufPtr, &outbuf );
1388
1389             res += 8-(outbuf/SIZEOF_WCHAR_T);
1390         } while ((cres==(size_t)-1) && (errno==E2BIG));
1391     }
1392
1393     if (ICONV_FAILED(cres, inbuf))
1394     {
1395         //VS: it is ok if iconv fails, hence trace only
1396         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1397         return (size_t)-1;
1398     }
1399
1400     return res;
1401 }
1402
1403 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1404 {
1405 #if wxUSE_THREADS
1406     // NB: explained in MB2WC
1407     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1408 #endif
1409
1410     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1411     size_t outbuf = n;
1412     size_t res, cres;
1413
1414     wchar_t *tmpbuf = 0;
1415
1416     if (ms_wcNeedsSwap)
1417     {
1418         // need to copy to temp buffer to switch endianness
1419         // this absolutely doesn't rock!
1420         // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1421         //  could be in read-only memory, or be accessed in some other thread)
1422         tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1423         memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1424         WC_BSWAP(tmpbuf, inbuf)
1425         psz=tmpbuf;
1426     }
1427
1428     if (buf)
1429     {
1430         // have destination buffer, convert there
1431         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1432
1433         res = n-outbuf;
1434
1435         // NB: iconv was given only wcslen(psz) characters on input, and so
1436         //     it couldn't convert the trailing zero. Let's do it ourselves
1437         //     if there's some room left for it in the output buffer.
1438         if (res < n)
1439             buf[0] = 0;
1440     }
1441     else
1442     {
1443         // no destination buffer... convert using temp buffer
1444         // to calculate destination buffer requirement
1445         char tbuf[16];
1446         res = 0;
1447         do {
1448             buf = tbuf; outbuf = 16;
1449
1450             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1451
1452             res += 16 - outbuf;
1453         } while ((cres==(size_t)-1) && (errno==E2BIG));
1454     }
1455
1456     if (ms_wcNeedsSwap)
1457     {
1458         free(tmpbuf);
1459     }
1460
1461     if (ICONV_FAILED(cres, inbuf))
1462     {
1463         //VS: it is ok if iconv fails, hence trace only
1464         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1465         return (size_t)-1;
1466     }
1467
1468     return res;
1469 }
1470
1471 #endif // HAVE_ICONV
1472
1473
1474 // ============================================================================
1475 // Win32 conversion classes
1476 // ============================================================================
1477
1478 #ifdef wxHAVE_WIN32_MB2WC
1479
1480 // from utils.cpp
1481 #if wxUSE_FONTMAP
1482 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1483 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1484 #endif
1485
1486 class wxMBConv_win32 : public wxMBConv
1487 {
1488 public:
1489     wxMBConv_win32()
1490     {
1491         m_CodePage = CP_ACP;
1492     }
1493
1494 #if wxUSE_FONTMAP
1495     wxMBConv_win32(const wxChar* name)
1496     {
1497         m_CodePage = wxCharsetToCodepage(name);
1498     }
1499
1500     wxMBConv_win32(wxFontEncoding encoding)
1501     {
1502         m_CodePage = wxEncodingToCodepage(encoding);
1503     }
1504 #endif
1505
1506     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1507     {
1508         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1509         // the behaviour is not compatible with the Unix version (using iconv)
1510         // and break the library itself, e.g. wxTextInputStream::NextChar()
1511         // wouldn't work if reading an incomplete MB char didn't result in an
1512         // error
1513         //
1514         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1515         // an error (tested under Windows Server 2003) and apparently it is
1516         // done on purpose, i.e. the function accepts any input in this case
1517         // and although I'd prefer to return error on ill-formed output, our
1518         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1519         // explicitly ill-formed according to RFC 2152) neither so we don't
1520         // even have any fallback here...
1521         int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1522
1523         const size_t len = ::MultiByteToWideChar
1524                              (
1525                                 m_CodePage,     // code page
1526                                 flags,          // flags: fall on error
1527                                 psz,            // input string
1528                                 -1,             // its length (NUL-terminated)
1529                                 buf,            // output string
1530                                 buf ? n : 0     // size of output buffer
1531                              );
1532
1533         // note that it returns count of written chars for buf != NULL and size
1534         // of the needed buffer for buf == NULL so in either case the length of
1535         // the string (which never includes the terminating NUL) is one less
1536         return len ? len - 1 : (size_t)-1;
1537     }
1538
1539     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1540     {
1541         /*
1542             we have a problem here: by default, WideCharToMultiByte() may
1543             replace characters unrepresentable in the target code page with bad
1544             quality approximations such as turning "1/2" symbol (U+00BD) into
1545             "1" for the code pages which don't have it and we, obviously, want
1546             to avoid this at any price
1547
1548             the trouble is that this function does it _silently_, i.e. it won't
1549             even tell us whether it did or not... Win98/2000 and higher provide
1550             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1551             we have to resort to a round trip, i.e. check that converting back
1552             results in the same string -- this is, of course, expensive but
1553             otherwise we simply can't be sure to not garble the data.
1554          */
1555
1556         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1557         // it doesn't work with CJK encodings (which we test for rather roughly
1558         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1559         // supporting it
1560         BOOL usedDef wxDUMMY_INITIALIZE(false);
1561         BOOL *pUsedDef;
1562         int flags;
1563         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1564         {
1565             // it's our lucky day
1566             flags = WC_NO_BEST_FIT_CHARS;
1567             pUsedDef = &usedDef;
1568         }
1569         else // old system or unsupported encoding
1570         {
1571             flags = 0;
1572             pUsedDef = NULL;
1573         }
1574
1575         const size_t len = ::WideCharToMultiByte
1576                              (
1577                                 m_CodePage,     // code page
1578                                 flags,          // either none or no best fit
1579                                 pwz,            // input string
1580                                 -1,             // it is (wide) NUL-terminated
1581                                 buf,            // output buffer
1582                                 buf ? n : 0,    // and its size
1583                                 NULL,           // default "replacement" char
1584                                 pUsedDef        // [out] was it used?
1585                              );
1586
1587         if ( !len )
1588         {
1589             // function totally failed
1590             return (size_t)-1;
1591         }
1592
1593         // if we were really converting, check if we succeeded
1594         if ( buf )
1595         {
1596             if ( flags )
1597             {
1598                 // check if the conversion failed, i.e. if any replacements
1599                 // were done
1600                 if ( usedDef )
1601                     return (size_t)-1;
1602             }
1603             else // we must resort to double tripping...
1604             {
1605                 wxWCharBuffer wcBuf(n);
1606                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1607                         wcscmp(wcBuf, pwz) != 0 )
1608                 {
1609                     // we didn't obtain the same thing we started from, hence
1610                     // the conversion was lossy and we consider that it failed
1611                     return (size_t)-1;
1612                 }
1613             }
1614         }
1615
1616         // see the comment above for the reason of "len - 1"
1617         return len - 1;
1618     }
1619
1620     bool IsOk() const { return m_CodePage != -1; }
1621
1622 private:
1623     static bool CanUseNoBestFit()
1624     {
1625         static int s_isWin98Or2k = -1;
1626
1627         if ( s_isWin98Or2k == -1 )
1628         {
1629             int verMaj, verMin;
1630             switch ( wxGetOsVersion(&verMaj, &verMin) )
1631             {
1632                 case wxWIN95:
1633                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1634                     break;
1635
1636                 case wxWINDOWS_NT:
1637                     s_isWin98Or2k = verMaj >= 5;
1638                     break;
1639
1640                 default:
1641                     // unknown, be conseravtive by default
1642                     s_isWin98Or2k = 0;
1643             }
1644
1645             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1646         }
1647
1648         return s_isWin98Or2k == 1;
1649     }
1650
1651     long m_CodePage;
1652 };
1653
1654 #endif // wxHAVE_WIN32_MB2WC
1655
1656 // ============================================================================
1657 // Cocoa conversion classes
1658 // ============================================================================
1659
1660 #if defined(__WXCOCOA__)
1661
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1665
1666 #include <CoreFoundation/CFString.h>
1667 #include <CoreFoundation/CFStringEncodingExt.h>
1668
1669 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1670 {
1671     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1672     if ( encoding == wxFONTENCODING_DEFAULT )
1673     {
1674         enc = CFStringGetSystemEncoding();
1675     }
1676     else switch( encoding)
1677     {
1678         case wxFONTENCODING_ISO8859_1 :
1679             enc = kCFStringEncodingISOLatin1 ;
1680             break ;
1681         case wxFONTENCODING_ISO8859_2 :
1682             enc = kCFStringEncodingISOLatin2;
1683             break ;
1684         case wxFONTENCODING_ISO8859_3 :
1685             enc = kCFStringEncodingISOLatin3 ;
1686             break ;
1687         case wxFONTENCODING_ISO8859_4 :
1688             enc = kCFStringEncodingISOLatin4;
1689             break ;
1690         case wxFONTENCODING_ISO8859_5 :
1691             enc = kCFStringEncodingISOLatinCyrillic;
1692             break ;
1693         case wxFONTENCODING_ISO8859_6 :
1694             enc = kCFStringEncodingISOLatinArabic;
1695             break ;
1696         case wxFONTENCODING_ISO8859_7 :
1697             enc = kCFStringEncodingISOLatinGreek;
1698             break ;
1699         case wxFONTENCODING_ISO8859_8 :
1700             enc = kCFStringEncodingISOLatinHebrew;
1701             break ;
1702         case wxFONTENCODING_ISO8859_9 :
1703             enc = kCFStringEncodingISOLatin5;
1704             break ;
1705         case wxFONTENCODING_ISO8859_10 :
1706             enc = kCFStringEncodingISOLatin6;
1707             break ;
1708         case wxFONTENCODING_ISO8859_11 :
1709             enc = kCFStringEncodingISOLatinThai;
1710             break ;
1711         case wxFONTENCODING_ISO8859_13 :
1712             enc = kCFStringEncodingISOLatin7;
1713             break ;
1714         case wxFONTENCODING_ISO8859_14 :
1715             enc = kCFStringEncodingISOLatin8;
1716             break ;
1717         case wxFONTENCODING_ISO8859_15 :
1718             enc = kCFStringEncodingISOLatin9;
1719             break ;
1720
1721         case wxFONTENCODING_KOI8 :
1722             enc = kCFStringEncodingKOI8_R;
1723             break ;
1724         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1725             enc = kCFStringEncodingDOSRussian;
1726             break ;
1727
1728 //      case wxFONTENCODING_BULGARIAN :
1729 //          enc = ;
1730 //          break ;
1731
1732         case wxFONTENCODING_CP437 :
1733             enc =kCFStringEncodingDOSLatinUS ;
1734             break ;
1735         case wxFONTENCODING_CP850 :
1736             enc = kCFStringEncodingDOSLatin1;
1737             break ;
1738         case wxFONTENCODING_CP852 :
1739             enc = kCFStringEncodingDOSLatin2;
1740             break ;
1741         case wxFONTENCODING_CP855 :
1742             enc = kCFStringEncodingDOSCyrillic;
1743             break ;
1744         case wxFONTENCODING_CP866 :
1745             enc =kCFStringEncodingDOSRussian ;
1746             break ;
1747         case wxFONTENCODING_CP874 :
1748             enc = kCFStringEncodingDOSThai;
1749             break ;
1750         case wxFONTENCODING_CP932 :
1751             enc = kCFStringEncodingDOSJapanese;
1752             break ;
1753         case wxFONTENCODING_CP936 :
1754             enc =kCFStringEncodingDOSChineseSimplif ;
1755             break ;
1756         case wxFONTENCODING_CP949 :
1757             enc = kCFStringEncodingDOSKorean;
1758             break ;
1759         case wxFONTENCODING_CP950 :
1760             enc = kCFStringEncodingDOSChineseTrad;
1761             break ;
1762         case wxFONTENCODING_CP1250 :
1763             enc = kCFStringEncodingWindowsLatin2;
1764             break ;
1765         case wxFONTENCODING_CP1251 :
1766             enc =kCFStringEncodingWindowsCyrillic ;
1767             break ;
1768         case wxFONTENCODING_CP1252 :
1769             enc =kCFStringEncodingWindowsLatin1 ;
1770             break ;
1771         case wxFONTENCODING_CP1253 :
1772             enc = kCFStringEncodingWindowsGreek;
1773             break ;
1774         case wxFONTENCODING_CP1254 :
1775             enc = kCFStringEncodingWindowsLatin5;
1776             break ;
1777         case wxFONTENCODING_CP1255 :
1778             enc =kCFStringEncodingWindowsHebrew ;
1779             break ;
1780         case wxFONTENCODING_CP1256 :
1781             enc =kCFStringEncodingWindowsArabic ;
1782             break ;
1783         case wxFONTENCODING_CP1257 :
1784             enc = kCFStringEncodingWindowsBalticRim;
1785             break ;
1786 //   This only really encodes to UTF7 (if that) evidently
1787 //        case wxFONTENCODING_UTF7 :
1788 //            enc = kCFStringEncodingNonLossyASCII ;
1789 //            break ;
1790         case wxFONTENCODING_UTF8 :
1791             enc = kCFStringEncodingUTF8 ;
1792             break ;
1793         case wxFONTENCODING_EUC_JP :
1794             enc = kCFStringEncodingEUC_JP;
1795             break ;
1796         case wxFONTENCODING_UTF16 :
1797             enc = kCFStringEncodingUnicode ;
1798             break ;
1799         case wxFONTENCODING_MACROMAN :
1800             enc = kCFStringEncodingMacRoman ;
1801             break ;
1802         case wxFONTENCODING_MACJAPANESE :
1803             enc = kCFStringEncodingMacJapanese ;
1804             break ;
1805         case wxFONTENCODING_MACCHINESETRAD :
1806             enc = kCFStringEncodingMacChineseTrad ;
1807             break ;
1808         case wxFONTENCODING_MACKOREAN :
1809             enc = kCFStringEncodingMacKorean ;
1810             break ;
1811         case wxFONTENCODING_MACARABIC :
1812             enc = kCFStringEncodingMacArabic ;
1813             break ;
1814         case wxFONTENCODING_MACHEBREW :
1815             enc = kCFStringEncodingMacHebrew ;
1816             break ;
1817         case wxFONTENCODING_MACGREEK :
1818             enc = kCFStringEncodingMacGreek ;
1819             break ;
1820         case wxFONTENCODING_MACCYRILLIC :
1821             enc = kCFStringEncodingMacCyrillic ;
1822             break ;
1823         case wxFONTENCODING_MACDEVANAGARI :
1824             enc = kCFStringEncodingMacDevanagari ;
1825             break ;
1826         case wxFONTENCODING_MACGURMUKHI :
1827             enc = kCFStringEncodingMacGurmukhi ;
1828             break ;
1829         case wxFONTENCODING_MACGUJARATI :
1830             enc = kCFStringEncodingMacGujarati ;
1831             break ;
1832         case wxFONTENCODING_MACORIYA :
1833             enc = kCFStringEncodingMacOriya ;
1834             break ;
1835         case wxFONTENCODING_MACBENGALI :
1836             enc = kCFStringEncodingMacBengali ;
1837             break ;
1838         case wxFONTENCODING_MACTAMIL :
1839             enc = kCFStringEncodingMacTamil ;
1840             break ;
1841         case wxFONTENCODING_MACTELUGU :
1842             enc = kCFStringEncodingMacTelugu ;
1843             break ;
1844         case wxFONTENCODING_MACKANNADA :
1845             enc = kCFStringEncodingMacKannada ;
1846             break ;
1847         case wxFONTENCODING_MACMALAJALAM :
1848             enc = kCFStringEncodingMacMalayalam ;
1849             break ;
1850         case wxFONTENCODING_MACSINHALESE :
1851             enc = kCFStringEncodingMacSinhalese ;
1852             break ;
1853         case wxFONTENCODING_MACBURMESE :
1854             enc = kCFStringEncodingMacBurmese ;
1855             break ;
1856         case wxFONTENCODING_MACKHMER :
1857             enc = kCFStringEncodingMacKhmer ;
1858             break ;
1859         case wxFONTENCODING_MACTHAI :
1860             enc = kCFStringEncodingMacThai ;
1861             break ;
1862         case wxFONTENCODING_MACLAOTIAN :
1863             enc = kCFStringEncodingMacLaotian ;
1864             break ;
1865         case wxFONTENCODING_MACGEORGIAN :
1866             enc = kCFStringEncodingMacGeorgian ;
1867             break ;
1868         case wxFONTENCODING_MACARMENIAN :
1869             enc = kCFStringEncodingMacArmenian ;
1870             break ;
1871         case wxFONTENCODING_MACCHINESESIMP :
1872             enc = kCFStringEncodingMacChineseSimp ;
1873             break ;
1874         case wxFONTENCODING_MACTIBETAN :
1875             enc = kCFStringEncodingMacTibetan ;
1876             break ;
1877         case wxFONTENCODING_MACMONGOLIAN :
1878             enc = kCFStringEncodingMacMongolian ;
1879             break ;
1880         case wxFONTENCODING_MACETHIOPIC :
1881             enc = kCFStringEncodingMacEthiopic ;
1882             break ;
1883         case wxFONTENCODING_MACCENTRALEUR :
1884             enc = kCFStringEncodingMacCentralEurRoman ;
1885             break ;
1886         case wxFONTENCODING_MACVIATNAMESE :
1887             enc = kCFStringEncodingMacVietnamese ;
1888             break ;
1889         case wxFONTENCODING_MACARABICEXT :
1890             enc = kCFStringEncodingMacExtArabic ;
1891             break ;
1892         case wxFONTENCODING_MACSYMBOL :
1893             enc = kCFStringEncodingMacSymbol ;
1894             break ;
1895         case wxFONTENCODING_MACDINGBATS :
1896             enc = kCFStringEncodingMacDingbats ;
1897             break ;
1898         case wxFONTENCODING_MACTURKISH :
1899             enc = kCFStringEncodingMacTurkish ;
1900             break ;
1901         case wxFONTENCODING_MACCROATIAN :
1902             enc = kCFStringEncodingMacCroatian ;
1903             break ;
1904         case wxFONTENCODING_MACICELANDIC :
1905             enc = kCFStringEncodingMacIcelandic ;
1906             break ;
1907         case wxFONTENCODING_MACROMANIAN :
1908             enc = kCFStringEncodingMacRomanian ;
1909             break ;
1910         case wxFONTENCODING_MACCELTIC :
1911             enc = kCFStringEncodingMacCeltic ;
1912             break ;
1913         case wxFONTENCODING_MACGAELIC :
1914             enc = kCFStringEncodingMacGaelic ;
1915             break ;
1916 //      case wxFONTENCODING_MACKEYBOARD :
1917 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
1918 //          break ;
1919         default :
1920             // because gcc is picky
1921             break ;
1922     } ;
1923     return enc ;
1924 }
1925
1926 class wxMBConv_cocoa : public wxMBConv
1927 {
1928 public:
1929     wxMBConv_cocoa()
1930     {
1931         Init(CFStringGetSystemEncoding()) ;
1932     }
1933
1934 #if wxUSE_FONTMAP
1935     wxMBConv_cocoa(const wxChar* name)
1936     {
1937         Init( wxCFStringEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
1938     }
1939 #endif
1940
1941     wxMBConv_cocoa(wxFontEncoding encoding)
1942     {
1943         Init( wxCFStringEncFromFontEnc(encoding) );
1944     }
1945
1946     ~wxMBConv_cocoa()
1947     {
1948     }
1949
1950     void Init( CFStringEncoding encoding)
1951     {
1952         m_encoding = encoding ;
1953     }
1954
1955     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1956     {
1957         wxASSERT(szUnConv);
1958
1959         CFStringRef theString = CFStringCreateWithBytes (
1960                                                 NULL, //the allocator
1961                                                 (const UInt8*)szUnConv,
1962                                                 strlen(szUnConv),
1963                                                 m_encoding,
1964                                                 false //no BOM/external representation
1965                                                 );
1966
1967         wxASSERT(theString);
1968
1969         size_t nOutLength = CFStringGetLength(theString);
1970
1971         if (szOut == NULL)
1972         {
1973             CFRelease(theString);
1974             return nOutLength;
1975         }
1976
1977         CFRange theRange = { 0, nOutSize };
1978
1979 #if SIZEOF_WCHAR_T == 4
1980         UniChar* szUniCharBuffer = new UniChar[nOutSize];
1981 #endif
1982
1983         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1984
1985         CFRelease(theString);
1986
1987         szUniCharBuffer[nOutLength] = '\0' ;
1988
1989 #if SIZEOF_WCHAR_T == 4
1990         wxMBConvUTF16 converter ;
1991         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1992         delete[] szUniCharBuffer;
1993 #endif
1994
1995         return nOutLength;
1996     }
1997
1998     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1999     {
2000         wxASSERT(szUnConv);
2001
2002         size_t nRealOutSize;
2003         size_t nBufSize = wxWcslen(szUnConv);
2004         UniChar* szUniBuffer = (UniChar*) szUnConv;
2005
2006 #if SIZEOF_WCHAR_T == 4
2007         wxMBConvUTF16BE converter ;
2008         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2009         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2010         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2011         nBufSize /= sizeof(UniChar);
2012 #endif
2013
2014         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2015                                 NULL, //allocator
2016                                 szUniBuffer,
2017                                 nBufSize,
2018                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
2019                             );
2020
2021         wxASSERT(theString);
2022
2023         //Note that CER puts a BOM when converting to unicode
2024         //so we  check and use getchars instead in that case
2025         if (m_encoding == kCFStringEncodingUnicode)
2026         {
2027             if (szOut != NULL)
2028                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
2029
2030             nRealOutSize = CFStringGetLength(theString) + 1;
2031         }
2032         else
2033         {
2034             CFStringGetBytes(
2035                 theString,
2036                 CFRangeMake(0, CFStringGetLength(theString)),
2037                 m_encoding,
2038                 0, //what to put in characters that can't be converted -
2039                     //0 tells CFString to return NULL if it meets such a character
2040                 false, //not an external representation
2041                 (UInt8*) szOut,
2042                 nOutSize,
2043                 (CFIndex*) &nRealOutSize
2044                         );
2045         }
2046
2047         CFRelease(theString);
2048
2049 #if SIZEOF_WCHAR_T == 4
2050         delete[] szUniBuffer;
2051 #endif
2052
2053         return  nRealOutSize - 1;
2054     }
2055
2056     bool IsOk() const
2057     {
2058         return m_encoding != kCFStringEncodingInvalidId &&
2059               CFStringIsEncodingAvailable(m_encoding);
2060     }
2061
2062 private:
2063     CFStringEncoding m_encoding ;
2064 };
2065
2066 #endif // defined(__WXCOCOA__)
2067
2068 // ============================================================================
2069 // Mac conversion classes
2070 // ============================================================================
2071
2072 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2073
2074 class wxMBConv_mac : public wxMBConv
2075 {
2076 public:
2077     wxMBConv_mac()
2078     {
2079         Init(CFStringGetSystemEncoding()) ;
2080     }
2081
2082 #if wxUSE_FONTMAP
2083     wxMBConv_mac(const wxChar* name)
2084     {
2085         Init( wxMacGetSystemEncFromFontEnc(wxFontMapperBase::Get()->CharsetToEncoding(name, false) ) ) ;
2086     }
2087 #endif
2088
2089     wxMBConv_mac(wxFontEncoding encoding)
2090     {
2091         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2092     }
2093
2094     ~wxMBConv_mac()
2095     {
2096         OSStatus status = noErr ;
2097         status = TECDisposeConverter(m_MB2WC_converter);
2098         status = TECDisposeConverter(m_WC2MB_converter);
2099     }
2100
2101
2102     void Init( TextEncodingBase encoding)
2103     {
2104         OSStatus status = noErr ;
2105         m_char_encoding = encoding ;
2106         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2107
2108         status = TECCreateConverter(&m_MB2WC_converter,
2109                                     m_char_encoding,
2110                                     m_unicode_encoding);
2111         status = TECCreateConverter(&m_WC2MB_converter,
2112                                     m_unicode_encoding,
2113                                     m_char_encoding);
2114     }
2115
2116     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2117     {
2118         OSStatus status = noErr ;
2119         ByteCount byteOutLen ;
2120         ByteCount byteInLen = strlen(psz) ;
2121         wchar_t *tbuf = NULL ;
2122         UniChar* ubuf = NULL ;
2123         size_t res = 0 ;
2124
2125         if (buf == NULL)
2126         {
2127             //apple specs say at least 32
2128             n = wxMax( 32 , byteInLen ) ;
2129             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2130         }
2131         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2132 #if SIZEOF_WCHAR_T == 4
2133         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2134 #else
2135         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2136 #endif
2137         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2138           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2139 #if SIZEOF_WCHAR_T == 4
2140         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2141         // is not properly terminated we get random characters at the end
2142         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2143         wxMBConvUTF16BE converter ;
2144         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2145         free( ubuf ) ;
2146 #else
2147         res = byteOutLen / sizeof( UniChar ) ;
2148 #endif
2149         if ( buf == NULL )
2150              free(tbuf) ;
2151
2152         if ( buf  && res < n)
2153             buf[res] = 0;
2154
2155         return res ;
2156     }
2157
2158     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2159     {
2160         OSStatus status = noErr ;
2161         ByteCount byteOutLen ;
2162         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2163
2164         char *tbuf = NULL ;
2165
2166         if (buf == NULL)
2167         {
2168             //apple specs say at least 32
2169             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2170             tbuf = (char*) malloc( n ) ;
2171         }
2172
2173         ByteCount byteBufferLen = n ;
2174         UniChar* ubuf = NULL ;
2175 #if SIZEOF_WCHAR_T == 4
2176         wxMBConvUTF16BE converter ;
2177         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2178         byteInLen = unicharlen ;
2179         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2180         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2181 #else
2182         ubuf = (UniChar*) psz ;
2183 #endif
2184         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2185             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2186 #if SIZEOF_WCHAR_T == 4
2187         free( ubuf ) ;
2188 #endif
2189         if ( buf == NULL )
2190             free(tbuf) ;
2191
2192         size_t res = byteOutLen ;
2193         if ( buf  && res < n)
2194         {
2195             buf[res] = 0;
2196
2197             //we need to double-trip to verify it didn't insert any ? in place
2198             //of bogus characters
2199             wxWCharBuffer wcBuf(n);
2200             size_t pszlen = wxWcslen(psz);
2201             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2202                         wxWcslen(wcBuf) != pszlen ||
2203                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2204             {
2205                 // we didn't obtain the same thing we started from, hence
2206                 // the conversion was lossy and we consider that it failed
2207                 return (size_t)-1;
2208             }
2209         }
2210
2211         return res ;
2212     }
2213
2214     bool IsOk() const
2215         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2216
2217 private:
2218     TECObjectRef m_MB2WC_converter ;
2219     TECObjectRef m_WC2MB_converter ;
2220
2221     TextEncodingBase m_char_encoding ;
2222     TextEncodingBase m_unicode_encoding ;
2223 };
2224
2225 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2226
2227 // ============================================================================
2228 // wxEncodingConverter based conversion classes
2229 // ============================================================================
2230
2231 #if wxUSE_FONTMAP
2232
2233 class wxMBConv_wxwin : public wxMBConv
2234 {
2235 private:
2236     void Init()
2237     {
2238         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2239                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2240     }
2241
2242 public:
2243     // temporarily just use wxEncodingConverter stuff,
2244     // so that it works while a better implementation is built
2245     wxMBConv_wxwin(const wxChar* name)
2246     {
2247         if (name)
2248             m_enc = wxFontMapperBase::Get()->CharsetToEncoding(name, false);
2249         else
2250             m_enc = wxFONTENCODING_SYSTEM;
2251
2252         Init();
2253     }
2254
2255     wxMBConv_wxwin(wxFontEncoding enc)
2256     {
2257         m_enc = enc;
2258
2259         Init();
2260     }
2261
2262     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2263     {
2264         size_t inbuf = strlen(psz);
2265         if (buf)
2266         {
2267             if (!m2w.Convert(psz,buf))
2268                 return (size_t)-1;
2269         }
2270         return inbuf;
2271     }
2272
2273     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2274     {
2275         const size_t inbuf = wxWcslen(psz);
2276         if (buf)
2277         {
2278             if (!w2m.Convert(psz,buf))
2279                 return (size_t)-1;
2280         }
2281
2282         return inbuf;
2283     }
2284
2285     bool IsOk() const { return m_ok; }
2286
2287 public:
2288     wxFontEncoding m_enc;
2289     wxEncodingConverter m2w, w2m;
2290
2291     // were we initialized successfully?
2292     bool m_ok;
2293
2294     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2295 };
2296
2297 #endif // wxUSE_FONTMAP
2298
2299 // ============================================================================
2300 // wxCSConv implementation
2301 // ============================================================================
2302
2303 void wxCSConv::Init()
2304 {
2305     m_name = NULL;
2306     m_convReal =  NULL;
2307     m_deferred = true;
2308 }
2309
2310 wxCSConv::wxCSConv(const wxChar *charset)
2311 {
2312     Init();
2313
2314     if ( charset )
2315     {
2316         SetName(charset);
2317     }
2318
2319     m_encoding = wxFONTENCODING_SYSTEM;
2320 }
2321
2322 wxCSConv::wxCSConv(wxFontEncoding encoding)
2323 {
2324     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2325     {
2326         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2327
2328         encoding = wxFONTENCODING_SYSTEM;
2329     }
2330
2331     Init();
2332
2333     m_encoding = encoding;
2334 }
2335
2336 wxCSConv::~wxCSConv()
2337 {
2338     Clear();
2339 }
2340
2341 wxCSConv::wxCSConv(const wxCSConv& conv)
2342         : wxMBConv()
2343 {
2344     Init();
2345
2346     SetName(conv.m_name);
2347     m_encoding = conv.m_encoding;
2348 }
2349
2350 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2351 {
2352     Clear();
2353
2354     SetName(conv.m_name);
2355     m_encoding = conv.m_encoding;
2356
2357     return *this;
2358 }
2359
2360 void wxCSConv::Clear()
2361 {
2362     free(m_name);
2363     delete m_convReal;
2364
2365     m_name = NULL;
2366     m_convReal = NULL;
2367 }
2368
2369 void wxCSConv::SetName(const wxChar *charset)
2370 {
2371     if (charset)
2372     {
2373         m_name = wxStrdup(charset);
2374         m_deferred = true;
2375     }
2376 }
2377
2378 wxMBConv *wxCSConv::DoCreate() const
2379 {
2380     // check for the special case of ASCII or ISO8859-1 charset: as we have
2381     // special knowledge of it anyhow, we don't need to create a special
2382     // conversion object
2383     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2384     {
2385         // don't convert at all
2386         return NULL;
2387     }
2388
2389     // we trust OS to do conversion better than we can so try external
2390     // conversion methods first
2391     //
2392     // the full order is:
2393     //      1. OS conversion (iconv() under Unix or Win32 API)
2394     //      2. hard coded conversions for UTF
2395     //      3. wxEncodingConverter as fall back
2396
2397     // step (1)
2398 #ifdef HAVE_ICONV
2399 #if !wxUSE_FONTMAP
2400     if ( m_name )
2401 #endif // !wxUSE_FONTMAP
2402     {
2403         wxString name(m_name);
2404
2405 #if wxUSE_FONTMAP
2406         if ( name.empty() )
2407             name = wxFontMapperBase::Get()->GetEncodingName(m_encoding);
2408 #endif // wxUSE_FONTMAP
2409
2410         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2411         if ( conv->IsOk() )
2412             return conv;
2413
2414         delete conv;
2415     }
2416 #endif // HAVE_ICONV
2417
2418 #ifdef wxHAVE_WIN32_MB2WC
2419     {
2420 #if wxUSE_FONTMAP
2421         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2422                                       : new wxMBConv_win32(m_encoding);
2423         if ( conv->IsOk() )
2424             return conv;
2425
2426         delete conv;
2427 #else
2428         return NULL;
2429 #endif
2430     }
2431 #endif // wxHAVE_WIN32_MB2WC
2432 #if defined(__WXMAC__)
2433     {
2434         // leave UTF16 and UTF32 to the built-ins of wx
2435         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ||
2436             ( m_encoding >= wxFONTENCODING_MACMIN && m_encoding <= wxFONTENCODING_MACMAX ) ) )
2437         {
2438
2439 #if wxUSE_FONTMAP
2440             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2441                                         : new wxMBConv_mac(m_encoding);
2442 #else
2443             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2444 #endif
2445             if ( conv->IsOk() )
2446                  return conv;
2447
2448             delete conv;
2449         }
2450     }
2451 #endif
2452 #if defined(__WXCOCOA__)
2453     {
2454         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2455         {
2456
2457 #if wxUSE_FONTMAP
2458             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2459                                           : new wxMBConv_cocoa(m_encoding);
2460 #else
2461             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2462 #endif
2463             if ( conv->IsOk() )
2464                  return conv;
2465
2466             delete conv;
2467         }
2468     }
2469 #endif
2470     // step (2)
2471     wxFontEncoding enc = m_encoding;
2472 #if wxUSE_FONTMAP
2473     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2474     {
2475         // use "false" to suppress interactive dialogs -- we can be called from
2476         // anywhere and popping up a dialog from here is the last thing we want to
2477         // do
2478         enc = wxFontMapperBase::Get()->CharsetToEncoding(m_name, false);
2479     }
2480 #endif // wxUSE_FONTMAP
2481
2482     switch ( enc )
2483     {
2484         case wxFONTENCODING_UTF7:
2485              return new wxMBConvUTF7;
2486
2487         case wxFONTENCODING_UTF8:
2488              return new wxMBConvUTF8;
2489
2490         case wxFONTENCODING_UTF16BE:
2491              return new wxMBConvUTF16BE;
2492
2493         case wxFONTENCODING_UTF16LE:
2494              return new wxMBConvUTF16LE;
2495
2496         case wxFONTENCODING_UTF32BE:
2497              return new wxMBConvUTF32BE;
2498
2499         case wxFONTENCODING_UTF32LE:
2500              return new wxMBConvUTF32LE;
2501
2502         default:
2503              // nothing to do but put here to suppress gcc warnings
2504              ;
2505     }
2506
2507     // step (3)
2508 #if wxUSE_FONTMAP
2509     {
2510         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2511                                       : new wxMBConv_wxwin(m_encoding);
2512         if ( conv->IsOk() )
2513             return conv;
2514
2515         delete conv;
2516     }
2517 #endif // wxUSE_FONTMAP
2518
2519     // NB: This is a hack to prevent deadlock. What could otherwise happen
2520     //     in Unicode build: wxConvLocal creation ends up being here
2521     //     because of some failure and logs the error. But wxLog will try to
2522     //     attach timestamp, for which it will need wxConvLocal (to convert
2523     //     time to char* and then wchar_t*), but that fails, tries to log
2524     //     error, but wxLog has a (already locked) critical section that
2525     //     guards static buffer.
2526     static bool alreadyLoggingError = false;
2527     if (!alreadyLoggingError)
2528     {
2529         alreadyLoggingError = true;
2530         wxLogError(_("Cannot convert from the charset '%s'!"),
2531                    m_name ? m_name
2532                       :
2533 #if wxUSE_FONTMAP
2534                          wxFontMapperBase::GetEncodingDescription(m_encoding).c_str()
2535 #else // !wxUSE_FONTMAP
2536                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2537 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2538               );
2539         alreadyLoggingError = false;
2540     }
2541
2542     return NULL;
2543 }
2544
2545 void wxCSConv::CreateConvIfNeeded() const
2546 {
2547     if ( m_deferred )
2548     {
2549         wxCSConv *self = (wxCSConv *)this; // const_cast
2550
2551 #if wxUSE_INTL
2552         // if we don't have neither the name nor the encoding, use the default
2553         // encoding for this system
2554         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2555         {
2556             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2557         }
2558 #endif // wxUSE_INTL
2559
2560         self->m_convReal = DoCreate();
2561         self->m_deferred = false;
2562     }
2563 }
2564
2565 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2566 {
2567     CreateConvIfNeeded();
2568
2569     if (m_convReal)
2570         return m_convReal->MB2WC(buf, psz, n);
2571
2572     // latin-1 (direct)
2573     size_t len = strlen(psz);
2574
2575     if (buf)
2576     {
2577         for (size_t c = 0; c <= len; c++)
2578             buf[c] = (unsigned char)(psz[c]);
2579     }
2580
2581     return len;
2582 }
2583
2584 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2585 {
2586     CreateConvIfNeeded();
2587
2588     if (m_convReal)
2589         return m_convReal->WC2MB(buf, psz, n);
2590
2591     // latin-1 (direct)
2592     const size_t len = wxWcslen(psz);
2593     if (buf)
2594     {
2595         for (size_t c = 0; c <= len; c++)
2596         {
2597             if (psz[c] > 0xFF)
2598                 return (size_t)-1;
2599             buf[c] = (char)psz[c];
2600         }
2601     }
2602     else
2603     {
2604         for (size_t c = 0; c <= len; c++)
2605         {
2606             if (psz[c] > 0xFF)
2607                 return (size_t)-1;
2608         }
2609     }
2610
2611     return len;
2612 }
2613
2614 // ----------------------------------------------------------------------------
2615 // globals
2616 // ----------------------------------------------------------------------------
2617
2618 #ifdef __WINDOWS__
2619     static wxMBConv_win32 wxConvLibcObj;
2620 #elif defined(__WXMAC__) && !defined(__MACH__)
2621     static wxMBConv_mac wxConvLibcObj ;
2622 #else
2623     static wxMBConvLibc wxConvLibcObj;
2624 #endif
2625
2626 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2627 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2628 static wxMBConvUTF7 wxConvUTF7Obj;
2629 static wxMBConvUTF8 wxConvUTF8Obj;
2630
2631 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2632 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2633 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2634 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2635 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2636 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2637 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvFileName = &
2638 #ifdef __WXOSX__
2639                                                     wxConvUTF8Obj;
2640 #else
2641                                                     wxConvLibcObj;
2642 #endif
2643
2644
2645 #else // !wxUSE_WCHAR_T
2646
2647 // stand-ins in absence of wchar_t
2648 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2649                                 wxConvISO8859_1,
2650                                 wxConvLocal,
2651                                 wxConvUTF8;
2652
2653 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2654
2655