src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
  24   #pragma implementation "strconv.h"
  25 #endif
  26
  27 // For compilers that support precompilation, includes "wx.h".
  28 #include "wx/wxprec.h"
  29
  30 #ifdef __BORLANDC__
  31   #pragma hdrstop
  32 #endif
  33
  34 #ifndef WX_PRECOMP
  35     #include "wx/intl.h"
  36     #include "wx/log.h"
  37 #endif // WX_PRECOMP
  38
  39 #include "wx/strconv.h"
  40
  41 #if wxUSE_WCHAR_T
  42
  43 #ifdef __WXMSW__
  44     #include "wx/msw/private.h"
  45 #endif
  46
  47 #ifdef __WINDOWS__
  48     #include "wx/msw/missing.h"
  49 #endif
  50
  51 #ifndef __WXWINCE__
  52 #include <errno.h>
  53 #endif
  54
  55 #include <ctype.h>
  56 #include <string.h>
  57 #include <stdlib.h>
  58
  59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  60     #define wxHAVE_WIN32_MB2WC
  61 #endif // __WIN32__ but !__WXMICROWIN__
  62
  63 // ----------------------------------------------------------------------------
  64 // headers
  65 // ----------------------------------------------------------------------------
  66
  67 #ifdef __SALFORDC__
  68     #include <clib.h>
  69 #endif
  70
  71 #ifdef HAVE_ICONV
  72     #include <iconv.h>
  73 #endif
  74
  75 #include "wx/encconv.h"
  76 #include "wx/fontmap.h"
  77 #include "wx/utils.h"
  78
  79 #ifdef __WXMAC__
  80 #include <ATSUnicode.h>
  81 #include <TextCommon.h>
  82 #include <TextEncodingConverter.h>
  83
  84 #include  "wx/mac/private.h"  // includes mac headers
  85 #endif
  86 // ----------------------------------------------------------------------------
  87 // macros
  88 // ----------------------------------------------------------------------------
  89
// In-place element-wise swap helpers: each macro expands to a braced block
// that walks the first `len` elements of `str` and replaces every element
// with the result of wxUINT32_SWAP_ALWAYS / wxUINT16_SWAP_ALWAYS (presumably
// an unconditional byte swap -- confirm against the wx headers).
// NOTE: `str` and `len` are pasted unparenthesized and evaluated on every
// iteration, so pass plain variables only. The expansion is a complete
// compound statement, not a do/while(0) wrapper.
#define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
#define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }

// Compile-time selection based on the size of wchar_t:
//   WC_NAME      - generic encoding name for wchar_t data ("UCS4" / "UTF16")
//   WC_BSWAP     - the swap macro matching the wchar_t width
//   WC_UTF16     - defined only when wchar_t is 16 bits wide
//   WC_NAME_BEST - endian-specific encoding name, chosen via WORDS_BIGENDIAN
#if SIZEOF_WCHAR_T == 4
    #define WC_NAME         "UCS4"
    #define WC_BSWAP         BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UCS-4BE"
    #else
      #define WC_NAME_BEST  "UCS-4LE"
    #endif
#elif SIZEOF_WCHAR_T == 2
    #define WC_NAME         "UTF16"
    #define WC_BSWAP         BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UTF-16BE"
    #else
      #define WC_NAME_BEST  "UTF-16LE"
    #endif
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
 114
 115 // ============================================================================
 116 // implementation
 117 // ============================================================================
 118
 119 // ----------------------------------------------------------------------------
 120 // UTF-16 en/decoding to/from UCS-4
 121 // ----------------------------------------------------------------------------
 122
 123
 124 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 125 {
 126     if (input<=0xffff)
 127     {
 128         if (output)
 129             *output = (wxUint16) input;
 130         return 1;
 131     }
 132     else if (input>=0x110000)
 133     {
 134         return (size_t)-1;
 135     }
 136     else
 137     {
 138         if (output)
 139         {
 140             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 141             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 142         }
 143         return 2;
 144     }
 145 }
 146
 147 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 148 {
 149     if ((*input<0xd800) || (*input>0xdfff))
 150     {
 151         output = *input;
 152         return 1;
 153     }
 154     else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
 155     {
 156         output = *input;
 157         return (size_t)-1;
 158     }
 159     else
 160     {
 161         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 162         return 2;
 163     }
 164 }
 165
 166
 167 // ----------------------------------------------------------------------------
 168 // wxMBConv
 169 // ----------------------------------------------------------------------------
 170
 171 wxMBConv::~wxMBConv()
 172 {
 173     // nothing to do here (necessary for Darwin linking probably)
 174 }
 175
 176 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 177 {
 178     if ( psz )
 179     {
 180         // calculate the length of the buffer needed first
 181         size_t nLen = MB2WC(NULL, psz, 0);
 182         if ( nLen != (size_t)-1 )
 183         {
 184             // now do the actual conversion
 185             wxWCharBuffer buf(nLen);
 186             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 187             if ( nLen != (size_t)-1 )
 188             {
 189                 return buf;
 190             }
 191         }
 192     }
 193
 194     wxWCharBuffer buf((wchar_t *)NULL);
 195
 196     return buf;
 197 }
 198
 199 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 200 {
 201     if ( pwz )
 202     {
 203         size_t nLen = WC2MB(NULL, pwz, 0);
 204         if ( nLen != (size_t)-1 )
 205         {
 206             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 207             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 208             if ( nLen != (size_t)-1 )
 209             {
 210                 return buf;
 211             }
 212         }
 213     }
 214
 215     wxCharBuffer buf((char *)NULL);
 216
 217     return buf;
 218 }
 219
 220 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
 221 {
 222     wxASSERT(pOutSize != NULL);
 223
 224     const char* szEnd = szString + nStringLen + 1;
 225     const char* szPos = szString;
 226     const char* szStart = szPos;
 227
 228     size_t nActualLength = 0;
 229     size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
 230
 231     wxWCharBuffer theBuffer(nCurrentSize);
 232
 233     //Convert the string until the length() is reached, continuing the
 234     //loop every time a null character is reached
 235     while(szPos != szEnd)
 236     {
 237         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 238
 239         //Get the length of the current (sub)string
 240         size_t nLen = MB2WC(NULL, szPos, 0);
 241
 242         //Invalid conversion?
 243         if( nLen == (size_t)-1 )
 244         {
 245             *pOutSize = 0;
 246             theBuffer.data()[0u] = wxT('\0');
 247             return theBuffer;
 248         }
 249
 250
 251         //Increase the actual length (+1 for current null character)
 252         nActualLength += nLen + 1;
 253
 254         //if buffer too big, realloc the buffer
 255         if (nActualLength > (nCurrentSize+1))
 256         {
 257             wxWCharBuffer theNewBuffer(nCurrentSize << 1);
 258             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
 259             theBuffer = theNewBuffer;
 260             nCurrentSize <<= 1;
 261         }
 262
 263         //Convert the current (sub)string
 264         if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 265         {
 266             *pOutSize = 0;
 267             theBuffer.data()[0u] = wxT('\0');
 268             return theBuffer;
 269         }
 270
 271         //Increment to next (sub)string
 272         //Note that we have to use strlen here instead of nLen
 273         //here because XX2XX gives us the size of the output buffer,
 274         //not neccessarly the length of the string
 275         szPos += strlen(szPos) + 1;
 276     }
 277
 278     //success - return actual length and the buffer
 279     *pOutSize = nActualLength;
 280     return theBuffer;
 281 }
 282
 283 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
 284 {
 285     wxASSERT(pOutSize != NULL);
 286
 287     const wchar_t* szEnd = szString + nStringLen + 1;
 288     const wchar_t* szPos = szString;
 289     const wchar_t* szStart = szPos;
 290
 291     size_t nActualLength = 0;
 292     size_t nCurrentSize = nStringLen << 2; //try * 4 first
 293
 294     wxCharBuffer theBuffer(nCurrentSize);
 295
 296     //Convert the string until the length() is reached, continuing the
 297     //loop every time a null character is reached
 298     while(szPos != szEnd)
 299     {
 300         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 301
 302         //Get the length of the current (sub)string
 303         size_t nLen = WC2MB(NULL, szPos, 0);
 304
 305         //Invalid conversion?
 306         if( nLen == (size_t)-1 )
 307         {
 308             *pOutSize = 0;
 309             theBuffer.data()[0u] = wxT('\0');
 310             return theBuffer;
 311         }
 312
 313         //Increase the actual length (+1 for current null character)
 314         nActualLength += nLen + 1;
 315
 316         //if buffer too big, realloc the buffer
 317         if (nActualLength > (nCurrentSize+1))
 318         {
 319             wxCharBuffer theNewBuffer(nCurrentSize << 1);
 320             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
 321             theBuffer = theNewBuffer;
 322             nCurrentSize <<= 1;
 323         }
 324
 325         //Convert the current (sub)string
 326         if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 327         {
 328             *pOutSize = 0;
 329             theBuffer.data()[0u] = wxT('\0');
 330             return theBuffer;
 331         }
 332
 333         //Increment to next (sub)string
 334         //Note that we have to use wxWcslen here instead of nLen
 335         //here because XX2XX gives us the size of the output buffer,
 336         //not neccessarly the length of the string
 337         szPos += wxWcslen(szPos) + 1;
 338     }
 339
 340     //success - return actual length and the buffer
 341     *pOutSize = nActualLength;
 342     return theBuffer;
 343 }
 344
 345 // ----------------------------------------------------------------------------
 346 // wxMBConvLibc
 347 // ----------------------------------------------------------------------------
 348
 349 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 350 {
 351     return wxMB2WC(buf, psz, n);
 352 }
 353
 354 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 355 {
 356     return wxWC2MB(buf, psz, n);
 357 }
 358 // ----------------------------------------------------------------------------
 359 // UTF-7
 360 // ----------------------------------------------------------------------------
 361
 362 // Implementation (C) 2004 Fredrik Roubert
 363
//
// BASE64 decoding table
//
// 256 entries, indexed by the byte value of an input character. The entry is
// the 6 bit value of that character in the base64 alphabet ('A'-'Z' -> 0x00..
// 0x19, 'a'-'z' -> 0x1a..0x33, '0'-'9' -> 0x34..0x3d, '+' -> 0x3e,
// '/' -> 0x3f) or 0xff for every byte that is not part of the alphabet.
// Entry 0 is 0xff, so a NUL terminator also ends a base64 run.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 402
 403 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 404 {
 405
 406     size_t len = 0;
 407
 408     while (*psz && ((!buf) || (len < n)))
 409     {
 410         unsigned char cc = *psz++;
 411         if (cc != '+')
 412         {
 413             // plain ASCII char
 414             if (buf)
 415                 *buf++ = cc;
 416             len++;
 417         }
 418         else if (*psz == '-')
 419         {
 420             // encoded plus sign
 421             if (buf)
 422                 *buf++ = cc;
 423             len++;
 424             psz++;
 425         }
 426         else
 427         {
 428             // BASE64 encoded string
 429             bool lsb;
 430             unsigned char c;
 431             unsigned int d, l;
 432             for (lsb = false, d = 0, l = 0;
 433                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
 434             {
 435                 d <<= 6;
 436                 d += cc;
 437                 for (l += 6; l >= 8; lsb = !lsb)
 438                 {
 439                     c = (unsigned char)((d >> (l -= 8)) % 256);
 440                     if (lsb)
 441                     {
 442                         if (buf)
 443                             *buf++ |= c;
 444                         len ++;
 445                     }
 446                     else
 447                         if (buf)
 448                             *buf = (wchar_t)(c << 8);
 449                 }
 450             }
 451             if (*psz == '-')
 452                 psz++;
 453         }
 454     }
 455     if (buf && (len < n))
 456         *buf = 0;
 457     return len;
 458 }
 459
//
// BASE64 encoding table
//
// 64 entries: maps a 6 bit value (0..63) to the character representing it,
// in the order 'A'-'Z', 'a'-'z', '0'-'9', '+', '/'.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 474
//
// UTF-7 encoding table
//
// One entry per 7 bit character code (0..127), giving its class:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder in this file writes a character directly only if its class
// is 0 (it tests `utf7encode[cc] < 1`); every other class is base64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 494
 495 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
 496 *psz, size_t n) const
 497 {
 498
 499
 500     size_t len = 0;
 501
 502     while (*psz && ((!buf) || (len < n)))
 503     {
 504         wchar_t cc = *psz++;
 505         if (cc < 0x80 && utf7encode[cc] < 1)
 506         {
 507             // plain ASCII char
 508             if (buf)
 509                 *buf++ = (char)cc;
 510             len++;
 511         }
 512 #ifndef WC_UTF16
 513         else if (((wxUint32)cc) > 0xffff)
 514             {
 515             // no surrogate pair generation (yet?)
 516             return (size_t)-1;
 517         }
 518 #endif
 519         else
 520         {
 521             if (buf)
 522                 *buf++ = '+';
 523             len++;
 524             if (cc != '+')
 525             {
 526                 // BASE64 encode string
 527                 unsigned int lsb, d, l;
 528                 for (d = 0, l = 0;; psz++)
 529                 {
 530                     for (lsb = 0; lsb < 2; lsb ++)
 531                     {
 532                         d <<= 8;
 533                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 534
 535                         for (l += 8; l >= 6; )
 536                         {
 537                             l -= 6;
 538                             if (buf)
 539                                 *buf++ = utf7enb64[(d >> l) % 64];
 540                             len++;
 541                         }
 542                     }
 543                     cc = *psz;
 544                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 545                         break;
 546                 }
 547                 if (l != 0)
 548                 {
 549                     if (buf)
 550                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 551                     len++;
 552                 }
 553             }
 554             if (buf)
 555                 *buf++ = '-';
 556             len++;
 557         }
 558     }
 559     if (buf && (len < n))
 560         *buf = 0;
 561     return len;
 562 }
 563
 564 // ----------------------------------------------------------------------------
 565 // UTF-8
 566 // ----------------------------------------------------------------------------
 567
 568 static wxUint32 utf8_max[]=
 569     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 570
 571 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 572 {
 573     size_t len = 0;
 574
 575     while (*psz && ((!buf) || (len < n)))
 576     {
 577         unsigned char cc = *psz++, fc = cc;
 578         unsigned cnt;
 579         for (cnt = 0; fc & 0x80; cnt++)
 580             fc <<= 1;
 581         if (!cnt)
 582         {
 583             // plain ASCII char
 584             if (buf)
 585                 *buf++ = cc;
 586             len++;
 587         }
 588         else
 589         {
 590             cnt--;
 591             if (!cnt)
 592             {
 593                 // invalid UTF-8 sequence
 594                 return (size_t)-1;
 595             }
 596             else
 597             {
 598                 unsigned ocnt = cnt - 1;
 599                 wxUint32 res = cc & (0x3f >> cnt);
 600                 while (cnt--)
 601                 {
 602                     cc = *psz++;
 603                     if ((cc & 0xC0) != 0x80)
 604                     {
 605                         // invalid UTF-8 sequence
 606                         return (size_t)-1;
 607                     }
 608                     res = (res << 6) | (cc & 0x3f);
 609                 }
 610                 if (res <= utf8_max[ocnt])
 611                 {
 612                     // illegal UTF-8 encoding
 613                     return (size_t)-1;
 614                 }
 615 #ifdef WC_UTF16
 616                 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 617                 size_t pa = encode_utf16(res, (wxUint16 *)buf);
 618                 if (pa == (size_t)-1)
 619                   return (size_t)-1;
 620                 if (buf)
 621                     buf += pa;
 622                 len += pa;
 623 #else // !WC_UTF16
 624                 if (buf)
 625                     *buf++ = res;
 626                 len++;
 627 #endif // WC_UTF16/!WC_UTF16
 628             }
 629         }
 630     }
 631     if (buf && (len < n))
 632         *buf = 0;
 633     return len;
 634 }
 635
 636 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 637 {
 638     size_t len = 0;
 639
 640     while (*psz && ((!buf) || (len < n)))
 641     {
 642         wxUint32 cc;
 643 #ifdef WC_UTF16
 644         // cast is ok for WC_UTF16
 645         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 646         psz += (pa == (size_t)-1) ? 1 : pa;
 647 #else
 648         cc=(*psz++) & 0x7fffffff;
 649 #endif
 650         unsigned cnt;
 651         for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 652         if (!cnt)
 653         {
 654             // plain ASCII char
 655             if (buf)
 656                 *buf++ = (char) cc;
 657             len++;
 658         }
 659
 660         else
 661         {
 662             len += cnt + 1;
 663             if (buf)
 664             {
 665                 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 666                 while (cnt--)
 667                     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 668             }
 669         }
 670     }
 671
 672     if (buf && (len<n)) *buf = 0;
 673
 674     return len;
 675 }
 676
 677
 678
 679
 680 // ----------------------------------------------------------------------------
 681 // UTF-16
 682 // ----------------------------------------------------------------------------
 683
// The UTF-16 converters below are written once as a "straight" (no byte
// swapping) and once as a "swap" variant. These macros map the two names
// onto the BE/LE classes: when WORDS_BIGENDIAN is defined the BE class is the
// straight one and the LE class the swapping one, otherwise the reverse.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 691
 692
 693 #ifdef WC_UTF16
 694
 695 // copy 16bit MB to 16bit String
 696 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 697 {
 698     size_t len=0;
 699
 700     while (*(wxUint16*)psz && (!buf || len < n))
 701     {
 702         if (buf)
 703             *buf++ = *(wxUint16*)psz;
 704         len++;
 705
 706         psz += sizeof(wxUint16);
 707     }
 708     if (buf && len<n)   *buf=0;
 709
 710     return len;
 711 }
 712
 713
 714 // copy 16bit String to 16bit MB
 715 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 716 {
 717     size_t len=0;
 718
 719     while (*psz && (!buf || len < n))
 720     {
 721         if (buf)
 722         {
 723             *(wxUint16*)buf = *psz;
 724             buf += sizeof(wxUint16);
 725         }
 726         len += sizeof(wxUint16);
 727         psz++;
 728     }
 729     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 730
 731     return len;
 732 }
 733
 734
 735 // swap 16bit MB to 16bit String
 736 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 737 {
 738     size_t len=0;
 739
 740     while (*(wxUint16*)psz && (!buf || len < n))
 741     {
 742         if (buf)
 743         {
 744             ((char *)buf)[0] = psz[1];
 745             ((char *)buf)[1] = psz[0];
 746             buf++;
 747         }
 748         len++;
 749         psz += sizeof(wxUint16);
 750     }
 751     if (buf && len<n)   *buf=0;
 752
 753     return len;
 754 }
 755
 756
 757 // swap 16bit MB to 16bit String
 758 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 759 {
 760     size_t len=0;
 761
 762     while (*psz && (!buf || len < n))
 763     {
 764         if (buf)
 765         {
 766             *buf++ = ((char*)psz)[1];
 767             *buf++ = ((char*)psz)[0];
 768         }
 769         len += sizeof(wxUint16);
 770         psz++;
 771     }
 772     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 773
 774     return len;
 775 }
 776
 777
 778 #else // WC_UTF16
 779
 780
 781 // copy 16bit MB to 32bit String
 782 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 783 {
 784     size_t len=0;
 785
 786     while (*(wxUint16*)psz && (!buf || len < n))
 787     {
 788         wxUint32 cc;
 789         size_t pa=decode_utf16((wxUint16*)psz, cc);
 790         if (pa == (size_t)-1)
 791             return pa;
 792
 793         if (buf)
 794             *buf++ = cc;
 795         len++;
 796         psz += pa * sizeof(wxUint16);
 797     }
 798     if (buf && len<n)   *buf=0;
 799
 800     return len;
 801 }
 802
 803
 804 // copy 32bit String to 16bit MB
 805 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 806 {
 807     size_t len=0;
 808
 809     while (*psz && (!buf || len < n))
 810     {
 811         wxUint16 cc[2];
 812         size_t pa=encode_utf16(*psz, cc);
 813
 814         if (pa == (size_t)-1)
 815             return pa;
 816
 817         if (buf)
 818         {
 819             *(wxUint16*)buf = cc[0];
 820             buf += sizeof(wxUint16);
 821             if (pa > 1)
 822             {
 823                 *(wxUint16*)buf = cc[1];
 824                 buf += sizeof(wxUint16);
 825             }
 826         }
 827
 828         len += pa*sizeof(wxUint16);
 829         psz++;
 830     }
 831     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 832
 833     return len;
 834 }
 835
 836
 837 // swap 16bit MB to 32bit String
 838 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 839 {
 840     size_t len=0;
 841
 842     while (*(wxUint16*)psz && (!buf || len < n))
 843     {
 844         wxUint32 cc;
 845         char tmp[4];
 846         tmp[0]=psz[1];  tmp[1]=psz[0];
 847         tmp[2]=psz[3];  tmp[3]=psz[2];
 848
 849         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 850         if (pa == (size_t)-1)
 851             return pa;
 852
 853         if (buf)
 854             *buf++ = cc;
 855
 856         len++;
 857         psz += pa * sizeof(wxUint16);
 858     }
 859     if (buf && len<n)   *buf=0;
 860
 861     return len;
 862 }
 863
 864
 865 // swap 32bit String to 16bit MB
 866 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 867 {
 868     size_t len=0;
 869
 870     while (*psz && (!buf || len < n))
 871     {
 872         wxUint16 cc[2];
 873         size_t pa=encode_utf16(*psz, cc);
 874
 875         if (pa == (size_t)-1)
 876             return pa;
 877
 878         if (buf)
 879         {
 880             *buf++ = ((char*)cc)[1];
 881             *buf++ = ((char*)cc)[0];
 882             if (pa > 1)
 883             {
 884                 *buf++ = ((char*)cc)[3];
 885                 *buf++ = ((char*)cc)[2];
 886             }
 887         }
 888
 889         len += pa*sizeof(wxUint16);
 890         psz++;
 891     }
 892     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 893
 894     return len;
 895 }
 896
 897 #endif // WC_UTF16
 898
 899
 900 // ----------------------------------------------------------------------------
 901 // UTF-32
 902 // ----------------------------------------------------------------------------
 903
// The UTF-32 converters below are written once as a "straight" (no byte
// swapping) and once as a "swap" variant. These macros map the two names
// onto the BE/LE classes: when WORDS_BIGENDIAN is defined the BE class is the
// straight one and the LE class the swapping one, otherwise the reverse.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif
 911
 912
// Global converter objects for little- and big-endian UTF-32.
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined outside this file; presumably
// it adds the DLL import/export decoration for data -- confirm in wx headers.
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
 915
 916
 917 #ifdef WC_UTF16
 918
 919 // copy 32bit MB to 16bit String
 920 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 921 {
 922     size_t len=0;
 923
 924     while (*(wxUint32*)psz && (!buf || len < n))
 925     {
 926         wxUint16 cc[2];
 927
 928         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
 929         if (pa == (size_t)-1)
 930             return pa;
 931
 932         if (buf)
 933         {
 934             *buf++ = cc[0];
 935             if (pa > 1)
 936                 *buf++ = cc[1];
 937         }
 938         len += pa;
 939         psz += sizeof(wxUint32);
 940     }
 941     if (buf && len<n)   *buf=0;
 942
 943     return len;
 944 }
 945
 946
 947 // copy 16bit String to 32bit MB
 948 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 949 {
 950     size_t len=0;
 951
 952     while (*psz && (!buf || len < n))
 953     {
 954         wxUint32 cc;
 955
 956         // cast is ok for WC_UTF16
 957         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 958         if (pa == (size_t)-1)
 959             return pa;
 960
 961         if (buf)
 962         {
 963             *(wxUint32*)buf = cc;
 964             buf += sizeof(wxUint32);
 965         }
 966         len += sizeof(wxUint32);
 967         psz += pa;
 968     }
 969
 970     if (buf && len<=n-sizeof(wxUint32))
 971         *(wxUint32*)buf=0;
 972
 973     return len;
 974 }
 975
 976
 977
 978 // swap 32bit MB to 16bit String
 979 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 980 {
 981     size_t len=0;
 982
 983     while (*(wxUint32*)psz && (!buf || len < n))
 984     {
 985         char tmp[4];
 986         tmp[0] = psz[3];   tmp[1] = psz[2];
 987         tmp[2] = psz[1];   tmp[3] = psz[0];
 988
 989
 990         wxUint16 cc[2];
 991
 992         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
 993         if (pa == (size_t)-1)
 994             return pa;
 995
 996         if (buf)
 997         {
 998             *buf++ = cc[0];
 999             if (pa > 1)
1000                 *buf++ = cc[1];
1001         }
1002         len += pa;
1003         psz += sizeof(wxUint32);
1004     }
1005
1006     if (buf && len<n)
1007         *buf=0;
1008
1009     return len;
1010 }
1011
1012
1013 // swap 16bit String to 32bit MB
1014 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1015 {
1016     size_t len=0;
1017
1018     while (*psz && (!buf || len < n))
1019     {
1020         char cc[4];
1021
1022         // cast is ok for WC_UTF16
1023         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1024         if (pa == (size_t)-1)
1025             return pa;
1026
1027         if (buf)
1028         {
1029             *buf++ = cc[3];
1030             *buf++ = cc[2];
1031             *buf++ = cc[1];
1032             *buf++ = cc[0];
1033         }
1034         len += sizeof(wxUint32);
1035         psz += pa;
1036     }
1037
1038     if (buf && len<=n-sizeof(wxUint32))
1039         *(wxUint32*)buf=0;
1040
1041     return len;
1042 }
1043
1044 #else // WC_UTF16
1045
1046
1047 // copy 32bit MB to 32bit String
1048 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1049 {
1050     size_t len=0;
1051
1052     while (*(wxUint32*)psz && (!buf || len < n))
1053     {
1054         if (buf)
1055             *buf++ = *(wxUint32*)psz;
1056         len++;
1057         psz += sizeof(wxUint32);
1058     }
1059
1060     if (buf && len<n)
1061         *buf=0;
1062
1063     return len;
1064 }
1065
1066
1067 // copy 32bit String to 32bit MB
1068 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1069 {
1070     size_t len=0;
1071
1072     while (*psz && (!buf || len < n))
1073     {
1074         if (buf)
1075         {
1076             *(wxUint32*)buf = *psz;
1077             buf += sizeof(wxUint32);
1078         }
1079
1080         len += sizeof(wxUint32);
1081         psz++;
1082     }
1083
1084     if (buf && len<=n-sizeof(wxUint32))
1085         *(wxUint32*)buf=0;
1086
1087     return len;
1088 }
1089
1090
1091 // swap 32bit MB to 32bit String
1092 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1093 {
1094     size_t len=0;
1095
1096     while (*(wxUint32*)psz && (!buf || len < n))
1097     {
1098         if (buf)
1099         {
1100             ((char *)buf)[0] = psz[3];
1101             ((char *)buf)[1] = psz[2];
1102             ((char *)buf)[2] = psz[1];
1103             ((char *)buf)[3] = psz[0];
1104             buf++;
1105         }
1106         len++;
1107         psz += sizeof(wxUint32);
1108     }
1109
1110     if (buf && len<n)
1111         *buf=0;
1112
1113     return len;
1114 }
1115
1116
1117 // swap 32bit String to 32bit MB
1118 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1119 {
1120     size_t len=0;
1121
1122     while (*psz && (!buf || len < n))
1123     {
1124         if (buf)
1125         {
1126             *buf++ = ((char *)psz)[3];
1127             *buf++ = ((char *)psz)[2];
1128             *buf++ = ((char *)psz)[1];
1129             *buf++ = ((char *)psz)[0];
1130         }
1131         len += sizeof(wxUint32);
1132         psz++;
1133     }
1134
1135     if (buf && len<=n-sizeof(wxUint32))
1136         *(wxUint32*)buf=0;
1137
1138     return len;
1139 }
1140
1141
1142 #endif // WC_UTF16
1143
1144
1145 // ============================================================================
1146 // The classes doing conversion using the iconv_xxx() functions
1147 // ============================================================================
1148
1149 #ifdef HAVE_ICONV
1150
1151 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1152 //     if output buffer is _exactly_ as big as needed. Such case is (unless there's
1153 //     yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1154 //     (which means error) and says there are 0 bytes left in the input buffer --
1155 //     when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1156 //     this alternative test for iconv() failure.
1157 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if an iconv() call returning cres and
// leaving bufLeft bytes of input unconverted should be treated as failed.
//
// NB: the arguments are parenthesized so that expressions using operators of
//     lower precedence than == or != can be safely passed to the macro
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif
1164
1165 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1166
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1170
1171 class wxMBConv_iconv : public wxMBConv
1172 {
1173 public:
1174     wxMBConv_iconv(const wxChar *name);
1175     virtual ~wxMBConv_iconv();
1176
1177     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1178     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1179
1180     bool IsOk() const
1181         { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1182
1183 protected:
1184     // the iconv handlers used to translate from multibyte to wide char and in
1185     // the other direction
1186     iconv_t m2w,
1187             w2m;
1188
1189 private:
1190     // the name (for iconv_open()) of a wide char charset -- if none is
1191     // available on this machine, it will remain NULL
1192     static const char *ms_wcCharsetName;
1193
1194     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1195     // different endian-ness than the native one
1196     static bool ms_wcNeedsSwap;
1197 };
1198
1199 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1200 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1201
1202 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1203 {
1204     // Do it the hard way
1205     char cname[100];
1206     for (size_t i = 0; i < wxStrlen(name)+1; i++)
1207         cname[i] = (char) name[i];
1208
1209     // check for charset that represents wchar_t:
1210     if (ms_wcCharsetName == NULL)
1211     {
1212         ms_wcNeedsSwap = false;
1213
1214         // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1215         ms_wcCharsetName = WC_NAME_BEST;
1216         m2w = iconv_open(ms_wcCharsetName, cname);
1217
1218         if (m2w == (iconv_t)-1)
1219         {
1220             // try charset w/o bytesex info (e.g. "UCS4")
1221             // and check for bytesex ourselves:
1222             ms_wcCharsetName = WC_NAME;
1223             m2w = iconv_open(ms_wcCharsetName, cname);
1224
1225             // last bet, try if it knows WCHAR_T pseudo-charset
1226             if (m2w == (iconv_t)-1)
1227             {
1228                 ms_wcCharsetName = "WCHAR_T";
1229                 m2w = iconv_open(ms_wcCharsetName, cname);
1230             }
1231
1232             if (m2w != (iconv_t)-1)
1233             {
1234                 char    buf[2], *bufPtr;
1235                 wchar_t wbuf[2], *wbufPtr;
1236                 size_t  insz, outsz;
1237                 size_t  res;
1238
1239                 buf[0] = 'A';
1240                 buf[1] = 0;
1241                 wbuf[0] = 0;
1242                 insz = 2;
1243                 outsz = SIZEOF_WCHAR_T * 2;
1244                 wbufPtr = wbuf;
1245                 bufPtr = buf;
1246
1247                 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1248                             (char**)&wbufPtr, &outsz);
1249
1250                 if (ICONV_FAILED(res, insz))
1251                 {
1252                     ms_wcCharsetName = NULL;
1253                     wxLogLastError(wxT("iconv"));
1254                     wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1255                 }
1256                 else
1257                 {
1258                     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1259                 }
1260             }
1261             else
1262             {
1263                 ms_wcCharsetName = NULL;
1264
1265                 // VS: we must not output an error here, since wxWidgets will safely
1266                 //     fall back to using wxEncodingConverter.
1267                 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1268                 //wxLogError(
1269             }
1270         }
1271         wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1272     }
1273     else // we already have ms_wcCharsetName
1274     {
1275         m2w = iconv_open(ms_wcCharsetName, cname);
1276     }
1277
1278     // NB: don't ever pass NULL to iconv_open(), it may crash!
1279     if ( ms_wcCharsetName )
1280     {
1281         w2m = iconv_open( cname, ms_wcCharsetName);
1282     }
1283     else
1284     {
1285         w2m = (iconv_t)-1;
1286     }
1287 }
1288
1289 wxMBConv_iconv::~wxMBConv_iconv()
1290 {
1291     if ( m2w != (iconv_t)-1 )
1292         iconv_close(m2w);
1293     if ( w2m != (iconv_t)-1 )
1294         iconv_close(w2m);
1295 }
1296
1297 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1298 {
1299     size_t inbuf = strlen(psz);
1300     size_t outbuf = n * SIZEOF_WCHAR_T;
1301     size_t res, cres;
1302     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1303     wchar_t *bufPtr = buf;
1304     const char *pszPtr = psz;
1305
1306     if (buf)
1307     {
1308         // have destination buffer, convert there
1309         cres = iconv(m2w,
1310                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1311                      (char**)&bufPtr, &outbuf);
1312         res = n - (outbuf / SIZEOF_WCHAR_T);
1313
1314         if (ms_wcNeedsSwap)
1315         {
1316             // convert to native endianness
1317             WC_BSWAP(buf /* _not_ bufPtr */, res)
1318         }
1319
1320         // NB: iconv was given only strlen(psz) characters on input, and so
1321         //     it couldn't convert the trailing zero. Let's do it ourselves
1322         //     if there's some room left for it in the output buffer.
1323         if (res < n)
1324             buf[res] = 0;
1325     }
1326     else
1327     {
1328         // no destination buffer... convert using temp buffer
1329         // to calculate destination buffer requirement
1330         wchar_t tbuf[8];
1331         res = 0;
1332         do {
1333             bufPtr = tbuf;
1334             outbuf = 8*SIZEOF_WCHAR_T;
1335
1336             cres = iconv(m2w,
1337                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1338                          (char**)&bufPtr, &outbuf );
1339
1340             res += 8-(outbuf/SIZEOF_WCHAR_T);
1341         } while ((cres==(size_t)-1) && (errno==E2BIG));
1342     }
1343
1344     if (ICONV_FAILED(cres, inbuf))
1345     {
1346         //VS: it is ok if iconv fails, hence trace only
1347         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1348         return (size_t)-1;
1349     }
1350
1351     return res;
1352 }
1353
1354 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1355 {
1356     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1357     size_t outbuf = n;
1358     size_t res, cres;
1359
1360     wchar_t *tmpbuf = 0;
1361
1362     if (ms_wcNeedsSwap)
1363     {
1364         // need to copy to temp buffer to switch endianness
1365         // this absolutely doesn't rock!
1366         // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1367         //  could be in read-only memory, or be accessed in some other thread)
1368         tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1369         memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1370         WC_BSWAP(tmpbuf, inbuf)
1371         psz=tmpbuf;
1372     }
1373
1374     if (buf)
1375     {
1376         // have destination buffer, convert there
1377         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1378
1379         res = n-outbuf;
1380
1381         // NB: iconv was given only wcslen(psz) characters on input, and so
1382         //     it couldn't convert the trailing zero. Let's do it ourselves
1383         //     if there's some room left for it in the output buffer.
1384         if (res < n)
1385             buf[0] = 0;
1386     }
1387     else
1388     {
1389         // no destination buffer... convert using temp buffer
1390         // to calculate destination buffer requirement
1391         char tbuf[16];
1392         res = 0;
1393         do {
1394             buf = tbuf; outbuf = 16;
1395
1396             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1397
1398             res += 16 - outbuf;
1399         } while ((cres==(size_t)-1) && (errno==E2BIG));
1400     }
1401
1402     if (ms_wcNeedsSwap)
1403     {
1404         free(tmpbuf);
1405     }
1406
1407     if (ICONV_FAILED(cres, inbuf))
1408     {
1409         //VS: it is ok if iconv fails, hence trace only
1410         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1411         return (size_t)-1;
1412     }
1413
1414     return res;
1415 }
1416
1417 #endif // HAVE_ICONV
1418
1419
1420 // ============================================================================
1421 // Win32 conversion classes
1422 // ============================================================================
1423
1424 #ifdef wxHAVE_WIN32_MB2WC
1425
1426 // from utils.cpp
1427 #if wxUSE_FONTMAP
1428 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1429 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1430 #endif
1431
1432 class wxMBConv_win32 : public wxMBConv
1433 {
1434 public:
1435     wxMBConv_win32()
1436     {
1437         m_CodePage = CP_ACP;
1438     }
1439
1440 #if wxUSE_FONTMAP
1441     wxMBConv_win32(const wxChar* name)
1442     {
1443         m_CodePage = wxCharsetToCodepage(name);
1444     }
1445
1446     wxMBConv_win32(wxFontEncoding encoding)
1447     {
1448         m_CodePage = wxEncodingToCodepage(encoding);
1449     }
1450 #endif
1451
1452     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1453     {
1454         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1455         // the behaviour is not compatible with the Unix version (using iconv)
1456         // and break the library itself, e.g. wxTextInputStream::NextChar()
1457         // wouldn't work if reading an incomplete MB char didn't result in an
1458         // error
1459         const size_t len = ::MultiByteToWideChar
1460                              (
1461                                 m_CodePage,     // code page
1462                                 MB_ERR_INVALID_CHARS, // flags: fall on error
1463                                 psz,            // input string
1464                                 -1,             // its length (NUL-terminated)
1465                                 buf,            // output string
1466                                 buf ? n : 0     // size of output buffer
1467                              );
1468
1469         // note that it returns count of written chars for buf != NULL and size
1470         // of the needed buffer for buf == NULL so in either case the length of
1471         // the string (which never includes the terminating NUL) is one less
1472         return len ? len - 1 : (size_t)-1;
1473     }
1474
1475     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1476     {
1477         /*
1478             we have a problem here: by default, WideCharToMultiByte() may
1479             replace characters unrepresentable in the target code page with bad
1480             quality approximations such as turning "1/2" symbol (U+00BD) into
1481             "1" for the code pages which don't have it and we, obviously, want
1482             to avoid this at any price
1483
1484             the trouble is that this function does it _silently_, i.e. it won't
1485             even tell us whether it did or not... Win98/2000 and higher provide
1486             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1487             we have to resort to a round trip, i.e. check that converting back
1488             results in the same string -- this is, of course, expensive but
1489             otherwise we simply can't be sure to not garble the data.
1490          */
1491
1492         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1493         // it doesn't work with CJK encodings (which we test for rather roughly
1494         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1495         // supporting it
1496         BOOL usedDef wxDUMMY_INITIALIZE(false);
1497         BOOL *pUsedDef;
1498         int flags;
1499         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1500         {
1501             // it's our lucky day
1502             flags = WC_NO_BEST_FIT_CHARS;
1503             pUsedDef = &usedDef;
1504         }
1505         else // old system or unsupported encoding
1506         {
1507             flags = 0;
1508             pUsedDef = NULL;
1509         }
1510
1511         const size_t len = ::WideCharToMultiByte
1512                              (
1513                                 m_CodePage,     // code page
1514                                 flags,          // either none or no best fit
1515                                 pwz,            // input string
1516                                 -1,             // it is (wide) NUL-terminated
1517                                 buf,            // output buffer
1518                                 buf ? n : 0,    // and its size
1519                                 NULL,           // default "replacement" char
1520                                 pUsedDef        // [out] was it used?
1521                              );
1522
1523         if ( !len )
1524         {
1525             // function totally failed
1526             return (size_t)-1;
1527         }
1528
1529         // if we were really converting, check if we succeeded
1530         if ( buf )
1531         {
1532             if ( flags )
1533             {
1534                 // check if the conversion failed, i.e. if any replacements
1535                 // were done
1536                 if ( usedDef )
1537                     return (size_t)-1;
1538             }
1539             else // we must resort to double tripping...
1540             {
1541                 wxWCharBuffer wcBuf(n);
1542                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1543                         wcscmp(wcBuf, pwz) != 0 )
1544                 {
1545                     // we didn't obtain the same thing we started from, hence
1546                     // the conversion was lossy and we consider that it failed
1547                     return (size_t)-1;
1548                 }
1549             }
1550         }
1551
1552         // see the comment above for the reason of "len - 1"
1553         return len - 1;
1554     }
1555
1556     bool IsOk() const { return m_CodePage != -1; }
1557
1558 private:
1559     static bool CanUseNoBestFit()
1560     {
1561         static int s_isWin98Or2k = -1;
1562
1563         if ( s_isWin98Or2k == -1 )
1564         {
1565             int verMaj, verMin;
1566             switch ( wxGetOsVersion(&verMaj, &verMin) )
1567             {
1568                 case wxWIN95:
1569                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1570                     break;
1571
1572                 case wxWINDOWS_NT:
1573                     s_isWin98Or2k = verMaj >= 5;
1574                     break;
1575
1576                 default:
1577                     // unknown, be conseravtive by default
1578                     s_isWin98Or2k = 0;
1579             }
1580
1581             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1582         }
1583
1584         return s_isWin98Or2k == 1;
1585     }
1586
1587     long m_CodePage;
1588 };
1589
1590 #endif // wxHAVE_WIN32_MB2WC
1591
1592 // ============================================================================
1593 // Cocoa conversion classes
1594 // ============================================================================
1595
1596 #if defined(__WXCOCOA__)
1597
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1601
1602 #include <CoreFoundation/CFString.h>
1603 #include <CoreFoundation/CFStringEncodingExt.h>
1604
1605 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1606 {
1607     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1608     if ( encoding == wxFONTENCODING_DEFAULT )
1609     {
1610         enc = CFStringGetSystemEncoding();
1611     }
1612     else switch( encoding)
1613     {
1614         case wxFONTENCODING_ISO8859_1 :
1615             enc = kCFStringEncodingISOLatin1 ;
1616             break ;
1617         case wxFONTENCODING_ISO8859_2 :
1618             enc = kCFStringEncodingISOLatin2;
1619             break ;
1620         case wxFONTENCODING_ISO8859_3 :
1621             enc = kCFStringEncodingISOLatin3 ;
1622             break ;
1623         case wxFONTENCODING_ISO8859_4 :
1624             enc = kCFStringEncodingISOLatin4;
1625             break ;
1626         case wxFONTENCODING_ISO8859_5 :
1627             enc = kCFStringEncodingISOLatinCyrillic;
1628             break ;
1629         case wxFONTENCODING_ISO8859_6 :
1630             enc = kCFStringEncodingISOLatinArabic;
1631             break ;
1632         case wxFONTENCODING_ISO8859_7 :
1633             enc = kCFStringEncodingISOLatinGreek;
1634             break ;
1635         case wxFONTENCODING_ISO8859_8 :
1636             enc = kCFStringEncodingISOLatinHebrew;
1637             break ;
1638         case wxFONTENCODING_ISO8859_9 :
1639             enc = kCFStringEncodingISOLatin5;
1640             break ;
1641         case wxFONTENCODING_ISO8859_10 :
1642             enc = kCFStringEncodingISOLatin6;
1643             break ;
1644         case wxFONTENCODING_ISO8859_11 :
1645             enc = kCFStringEncodingISOLatinThai;
1646             break ;
1647         case wxFONTENCODING_ISO8859_13 :
1648             enc = kCFStringEncodingISOLatin7;
1649             break ;
1650         case wxFONTENCODING_ISO8859_14 :
1651             enc = kCFStringEncodingISOLatin8;
1652             break ;
1653         case wxFONTENCODING_ISO8859_15 :
1654             enc = kCFStringEncodingISOLatin9;
1655             break ;
1656
1657         case wxFONTENCODING_KOI8 :
1658             enc = kCFStringEncodingKOI8_R;
1659             break ;
1660         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1661             enc = kCFStringEncodingDOSRussian;
1662             break ;
1663
1664 //      case wxFONTENCODING_BULGARIAN :
1665 //          enc = ;
1666 //          break ;
1667
1668         case wxFONTENCODING_CP437 :
1669             enc =kCFStringEncodingDOSLatinUS ;
1670             break ;
1671         case wxFONTENCODING_CP850 :
1672             enc = kCFStringEncodingDOSLatin1;
1673             break ;
1674         case wxFONTENCODING_CP852 :
1675             enc = kCFStringEncodingDOSLatin2;
1676             break ;
1677         case wxFONTENCODING_CP855 :
1678             enc = kCFStringEncodingDOSCyrillic;
1679             break ;
1680         case wxFONTENCODING_CP866 :
1681             enc =kCFStringEncodingDOSRussian ;
1682             break ;
1683         case wxFONTENCODING_CP874 :
1684             enc = kCFStringEncodingDOSThai;
1685             break ;
1686         case wxFONTENCODING_CP932 :
1687             enc = kCFStringEncodingDOSJapanese;
1688             break ;
1689         case wxFONTENCODING_CP936 :
1690             enc =kCFStringEncodingDOSChineseSimplif ;
1691             break ;
1692         case wxFONTENCODING_CP949 :
1693             enc = kCFStringEncodingDOSKorean;
1694             break ;
1695         case wxFONTENCODING_CP950 :
1696             enc = kCFStringEncodingDOSChineseTrad;
1697             break ;
1698         case wxFONTENCODING_CP1250 :
1699             enc = kCFStringEncodingWindowsLatin2;
1700             break ;
1701         case wxFONTENCODING_CP1251 :
1702             enc =kCFStringEncodingWindowsCyrillic ;
1703             break ;
1704         case wxFONTENCODING_CP1252 :
1705             enc =kCFStringEncodingWindowsLatin1 ;
1706             break ;
1707         case wxFONTENCODING_CP1253 :
1708             enc = kCFStringEncodingWindowsGreek;
1709             break ;
1710         case wxFONTENCODING_CP1254 :
1711             enc = kCFStringEncodingWindowsLatin5;
1712             break ;
1713         case wxFONTENCODING_CP1255 :
1714             enc =kCFStringEncodingWindowsHebrew ;
1715             break ;
1716         case wxFONTENCODING_CP1256 :
1717             enc =kCFStringEncodingWindowsArabic ;
1718             break ;
1719         case wxFONTENCODING_CP1257 :
1720             enc = kCFStringEncodingWindowsBalticRim;
1721             break ;
1722 //   This only really encodes to UTF7 (if that) evidently
1723 //        case wxFONTENCODING_UTF7 :
1724 //            enc = kCFStringEncodingNonLossyASCII ;
1725 //            break ;
1726         case wxFONTENCODING_UTF8 :
1727             enc = kCFStringEncodingUTF8 ;
1728             break ;
1729         case wxFONTENCODING_EUC_JP :
1730             enc = kCFStringEncodingEUC_JP;
1731             break ;
1732         case wxFONTENCODING_UTF16 :
1733             enc = kCFStringEncodingUnicode ;
1734             break ;
1735         case wxFONTENCODING_MACROMAN :
1736             enc = kCFStringEncodingMacRoman ;
1737             break ;
1738         case wxFONTENCODING_MACJAPANESE :
1739             enc = kCFStringEncodingMacJapanese ;
1740             break ;
1741         case wxFONTENCODING_MACCHINESETRAD :
1742             enc = kCFStringEncodingMacChineseTrad ;
1743             break ;
1744         case wxFONTENCODING_MACKOREAN :
1745             enc = kCFStringEncodingMacKorean ;
1746             break ;
1747         case wxFONTENCODING_MACARABIC :
1748             enc = kCFStringEncodingMacArabic ;
1749             break ;
1750         case wxFONTENCODING_MACHEBREW :
1751             enc = kCFStringEncodingMacHebrew ;
1752             break ;
1753         case wxFONTENCODING_MACGREEK :
1754             enc = kCFStringEncodingMacGreek ;
1755             break ;
1756         case wxFONTENCODING_MACCYRILLIC :
1757             enc = kCFStringEncodingMacCyrillic ;
1758             break ;
1759         case wxFONTENCODING_MACDEVANAGARI :
1760             enc = kCFStringEncodingMacDevanagari ;
1761             break ;
1762         case wxFONTENCODING_MACGURMUKHI :
1763             enc = kCFStringEncodingMacGurmukhi ;
1764             break ;
1765         case wxFONTENCODING_MACGUJARATI :
1766             enc = kCFStringEncodingMacGujarati ;
1767             break ;
1768         case wxFONTENCODING_MACORIYA :
1769             enc = kCFStringEncodingMacOriya ;
1770             break ;
1771         case wxFONTENCODING_MACBENGALI :
1772             enc = kCFStringEncodingMacBengali ;
1773             break ;
1774         case wxFONTENCODING_MACTAMIL :
1775             enc = kCFStringEncodingMacTamil ;
1776             break ;
1777         case wxFONTENCODING_MACTELUGU :
1778             enc = kCFStringEncodingMacTelugu ;
1779             break ;
1780         case wxFONTENCODING_MACKANNADA :
1781             enc = kCFStringEncodingMacKannada ;
1782             break ;
1783         case wxFONTENCODING_MACMALAJALAM :
1784             enc = kCFStringEncodingMacMalayalam ;
1785             break ;
1786         case wxFONTENCODING_MACSINHALESE :
1787             enc = kCFStringEncodingMacSinhalese ;
1788             break ;
1789         case wxFONTENCODING_MACBURMESE :
1790             enc = kCFStringEncodingMacBurmese ;
1791             break ;
1792         case wxFONTENCODING_MACKHMER :
1793             enc = kCFStringEncodingMacKhmer ;
1794             break ;
1795         case wxFONTENCODING_MACTHAI :
1796             enc = kCFStringEncodingMacThai ;
1797             break ;
1798         case wxFONTENCODING_MACLAOTIAN :
1799             enc = kCFStringEncodingMacLaotian ;
1800             break ;
1801         case wxFONTENCODING_MACGEORGIAN :
1802             enc = kCFStringEncodingMacGeorgian ;
1803             break ;
1804         case wxFONTENCODING_MACARMENIAN :
1805             enc = kCFStringEncodingMacArmenian ;
1806             break ;
1807         case wxFONTENCODING_MACCHINESESIMP :
1808             enc = kCFStringEncodingMacChineseSimp ;
1809             break ;
1810         case wxFONTENCODING_MACTIBETAN :
1811             enc = kCFStringEncodingMacTibetan ;
1812             break ;
1813         case wxFONTENCODING_MACMONGOLIAN :
1814             enc = kCFStringEncodingMacMongolian ;
1815             break ;
1816         case wxFONTENCODING_MACETHIOPIC :
1817             enc = kCFStringEncodingMacEthiopic ;
1818             break ;
1819         case wxFONTENCODING_MACCENTRALEUR :
1820             enc = kCFStringEncodingMacCentralEurRoman ;
1821             break ;
1822         case wxFONTENCODING_MACVIATNAMESE :
1823             enc = kCFStringEncodingMacVietnamese ;
1824             break ;
1825         case wxFONTENCODING_MACARABICEXT :
1826             enc = kCFStringEncodingMacExtArabic ;
1827             break ;
1828         case wxFONTENCODING_MACSYMBOL :
1829             enc = kCFStringEncodingMacSymbol ;
1830             break ;
1831         case wxFONTENCODING_MACDINGBATS :
1832             enc = kCFStringEncodingMacDingbats ;
1833             break ;
1834         case wxFONTENCODING_MACTURKISH :
1835             enc = kCFStringEncodingMacTurkish ;
1836             break ;
1837         case wxFONTENCODING_MACCROATIAN :
1838             enc = kCFStringEncodingMacCroatian ;
1839             break ;
1840         case wxFONTENCODING_MACICELANDIC :
1841             enc = kCFStringEncodingMacIcelandic ;
1842             break ;
1843         case wxFONTENCODING_MACROMANIAN :
1844             enc = kCFStringEncodingMacRomanian ;
1845             break ;
1846         case wxFONTENCODING_MACCELTIC :
1847             enc = kCFStringEncodingMacCeltic ;
1848             break ;
1849         case wxFONTENCODING_MACGAELIC :
1850             enc = kCFStringEncodingMacGaelic ;
1851             break ;
1852 //      case wxFONTENCODING_MACKEYBOARD :
1853 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
1854 //          break ;
1855         default :
1856             // because gcc is picky
1857             break ;
1858     } ;
1859     return enc ;
1860 }
1861
1862 class wxMBConv_cocoa : public wxMBConv
1863 {
1864 public:
1865     wxMBConv_cocoa()
1866     {
1867         Init(CFStringGetSystemEncoding()) ;
1868     }
1869
1870     wxMBConv_cocoa(const wxChar* name)
1871     {
1872         Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1873     }
1874
1875     wxMBConv_cocoa(wxFontEncoding encoding)
1876     {
1877         Init( wxCFStringEncFromFontEnc(encoding) );
1878     }
1879
1880     ~wxMBConv_cocoa()
1881     {
1882     }
1883
1884     void Init( CFStringEncoding encoding)
1885     {
1886         m_encoding = encoding ;
1887     }
1888
1889     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1890     {
1891         wxASSERT(szUnConv);
1892
1893         CFStringRef theString = CFStringCreateWithBytes (
1894                                                 NULL, //the allocator
1895                                                 (const UInt8*)szUnConv,
1896                                                 strlen(szUnConv),
1897                                                 m_encoding,
1898                                                 false //no BOM/external representation
1899                                                 );
1900
1901         wxASSERT(theString);
1902
1903         size_t nOutLength = CFStringGetLength(theString);
1904
1905         if (szOut == NULL)
1906         {
1907             CFRelease(theString);
1908             return nOutLength;
1909         }
1910
1911         CFRange theRange = { 0, nOutSize };
1912
1913 #if SIZEOF_WCHAR_T == 4
1914         UniChar* szUniCharBuffer = new UniChar[nOutSize];
1915 #endif
1916
1917         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1918
1919         CFRelease(theString);
1920
1921         szUniCharBuffer[nOutLength] = '\0' ;
1922
1923 #if SIZEOF_WCHAR_T == 4
1924         wxMBConvUTF16 converter ;
1925         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1926         delete[] szUniCharBuffer;
1927 #endif
1928
1929         return nOutLength;
1930     }
1931
1932     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1933     {
1934         wxASSERT(szUnConv);
1935
1936         size_t nRealOutSize;
1937         size_t nBufSize = wxWcslen(szUnConv);
1938         UniChar* szUniBuffer = (UniChar*) szUnConv;
1939
1940 #if SIZEOF_WCHAR_T == 4
1941         wxMBConvUTF16BE converter ;
1942         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1943         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1944         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1945         nBufSize /= sizeof(UniChar);
1946 #endif
1947
1948         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1949                                 NULL, //allocator
1950                                 szUniBuffer,
1951                                 nBufSize,
1952                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1953                             );
1954
1955         wxASSERT(theString);
1956
1957         //Note that CER puts a BOM when converting to unicode
1958         //so we  check and use getchars instead in that case
1959         if (m_encoding == kCFStringEncodingUnicode)
1960         {
1961             if (szOut != NULL)
1962                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1963
1964             nRealOutSize = CFStringGetLength(theString) + 1;
1965         }
1966         else
1967         {
1968             CFStringGetBytes(
1969                 theString,
1970                 CFRangeMake(0, CFStringGetLength(theString)),
1971                 m_encoding,
1972                 0, //what to put in characters that can't be converted -
1973                     //0 tells CFString to return NULL if it meets such a character
1974                 false, //not an external representation
1975                 (UInt8*) szOut,
1976                 nOutSize,
1977                 (CFIndex*) &nRealOutSize
1978                         );
1979         }
1980
1981         CFRelease(theString);
1982
1983 #if SIZEOF_WCHAR_T == 4
1984         delete[] szUniBuffer;
1985 #endif
1986
1987         return  nRealOutSize - 1;
1988     }
1989
1990     bool IsOk() const
1991     {
1992         return m_encoding != kCFStringEncodingInvalidId &&
1993               CFStringIsEncodingAvailable(m_encoding);
1994     }
1995
1996 private:
1997     CFStringEncoding m_encoding ;
1998 };
1999
2000 #endif // defined(__WXCOCOA__)
2001
2002 // ============================================================================
2003 // Mac conversion classes
2004 // ============================================================================
2005
2006 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2007
2008 class wxMBConv_mac : public wxMBConv
2009 {
2010 public:
2011     wxMBConv_mac()
2012     {
2013         Init(CFStringGetSystemEncoding()) ;
2014     }
2015
2016     wxMBConv_mac(const wxChar* name)
2017     {
2018         Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2019     }
2020
2021     wxMBConv_mac(wxFontEncoding encoding)
2022     {
2023         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2024     }
2025
2026     ~wxMBConv_mac()
2027     {
2028         OSStatus status = noErr ;
2029         status = TECDisposeConverter(m_MB2WC_converter);
2030         status = TECDisposeConverter(m_WC2MB_converter);
2031     }
2032
2033
2034     void Init( TextEncodingBase encoding)
2035     {
2036         OSStatus status = noErr ;
2037         m_char_encoding = encoding ;
2038         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2039
2040         status = TECCreateConverter(&m_MB2WC_converter,
2041                                     m_char_encoding,
2042                                     m_unicode_encoding);
2043         status = TECCreateConverter(&m_WC2MB_converter,
2044                                     m_unicode_encoding,
2045                                     m_char_encoding);
2046     }
2047
2048     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2049     {
2050         OSStatus status = noErr ;
2051         ByteCount byteOutLen ;
2052         ByteCount byteInLen = strlen(psz) ;
2053         wchar_t *tbuf = NULL ;
2054         UniChar* ubuf = NULL ;
2055         size_t res = 0 ;
2056
2057         if (buf == NULL)
2058         {
2059             //apple specs say at least 32
2060             n = wxMax( 32 , byteInLen ) ;
2061             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2062         }
2063         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2064 #if SIZEOF_WCHAR_T == 4
2065         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2066 #else
2067         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2068 #endif
2069         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2070           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2071 #if SIZEOF_WCHAR_T == 4
2072         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2073         // is not properly terminated we get random characters at the end
2074         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2075         wxMBConvUTF16BE converter ;
2076         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2077         free( ubuf ) ;
2078 #else
2079         res = byteOutLen / sizeof( UniChar ) ;
2080 #endif
2081         if ( buf == NULL )
2082              free(tbuf) ;
2083
2084         if ( buf  && res < n)
2085             buf[res] = 0;
2086
2087         return res ;
2088     }
2089
2090     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2091     {
2092         OSStatus status = noErr ;
2093         ByteCount byteOutLen ;
2094         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2095
2096         char *tbuf = NULL ;
2097
2098         if (buf == NULL)
2099         {
2100             //apple specs say at least 32
2101             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2102             tbuf = (char*) malloc( n ) ;
2103         }
2104
2105         ByteCount byteBufferLen = n ;
2106         UniChar* ubuf = NULL ;
2107 #if SIZEOF_WCHAR_T == 4
2108         wxMBConvUTF16BE converter ;
2109         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2110         byteInLen = unicharlen ;
2111         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2112         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2113 #else
2114         ubuf = (UniChar*) psz ;
2115 #endif
2116         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2117             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2118 #if SIZEOF_WCHAR_T == 4
2119         free( ubuf ) ;
2120 #endif
2121         if ( buf == NULL )
2122             free(tbuf) ;
2123
2124         size_t res = byteOutLen ;
2125         if ( buf  && res < n)
2126         {
2127             buf[res] = 0;
2128
2129             //we need to double-trip to verify it didn't insert any ? in place
2130             //of bogus characters
2131             wxWCharBuffer wcBuf(n);
2132             size_t pszlen = wxWcslen(psz);
2133             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2134                         wxWcslen(wcBuf) != pszlen ||
2135                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2136             {
2137                 // we didn't obtain the same thing we started from, hence
2138                 // the conversion was lossy and we consider that it failed
2139                 return (size_t)-1;
2140             }
2141         }
2142
2143         return res ;
2144     }
2145
2146     bool IsOk() const
2147         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2148
2149 private:
2150     TECObjectRef m_MB2WC_converter ;
2151     TECObjectRef m_WC2MB_converter ;
2152
2153     TextEncodingBase m_char_encoding ;
2154     TextEncodingBase m_unicode_encoding ;
2155 };
2156
2157 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2158
2159 // ============================================================================
2160 // wxEncodingConverter based conversion classes
2161 // ============================================================================
2162
2163 #if wxUSE_FONTMAP
2164
2165 class wxMBConv_wxwin : public wxMBConv
2166 {
2167 private:
2168     void Init()
2169     {
2170         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2171                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2172     }
2173
2174 public:
2175     // temporarily just use wxEncodingConverter stuff,
2176     // so that it works while a better implementation is built
2177     wxMBConv_wxwin(const wxChar* name)
2178     {
2179         if (name)
2180             m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2181         else
2182             m_enc = wxFONTENCODING_SYSTEM;
2183
2184         Init();
2185     }
2186
2187     wxMBConv_wxwin(wxFontEncoding enc)
2188     {
2189         m_enc = enc;
2190
2191         Init();
2192     }
2193
2194     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2195     {
2196         size_t inbuf = strlen(psz);
2197         if (buf)
2198             m2w.Convert(psz,buf);
2199         return inbuf;
2200     }
2201
2202     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2203     {
2204         const size_t inbuf = wxWcslen(psz);
2205         if (buf)
2206             w2m.Convert(psz,buf);
2207
2208         return inbuf;
2209     }
2210
2211     bool IsOk() const { return m_ok; }
2212
2213 public:
2214     wxFontEncoding m_enc;
2215     wxEncodingConverter m2w, w2m;
2216
2217     // were we initialized successfully?
2218     bool m_ok;
2219
2220     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2221 };
2222
2223 #endif // wxUSE_FONTMAP
2224
2225 // ============================================================================
2226 // wxCSConv implementation
2227 // ============================================================================
2228
2229 void wxCSConv::Init()
2230 {
2231     m_name = NULL;
2232     m_convReal =  NULL;
2233     m_deferred = true;
2234 }
2235
2236 wxCSConv::wxCSConv(const wxChar *charset)
2237 {
2238     Init();
2239
2240     if ( charset )
2241     {
2242         SetName(charset);
2243     }
2244
2245     m_encoding = wxFONTENCODING_SYSTEM;
2246 }
2247
2248 wxCSConv::wxCSConv(wxFontEncoding encoding)
2249 {
2250     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2251     {
2252         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2253
2254         encoding = wxFONTENCODING_SYSTEM;
2255     }
2256
2257     Init();
2258
2259     m_encoding = encoding;
2260 }
2261
2262 wxCSConv::~wxCSConv()
2263 {
2264     Clear();
2265 }
2266
2267 wxCSConv::wxCSConv(const wxCSConv& conv)
2268         : wxMBConv()
2269 {
2270     Init();
2271
2272     SetName(conv.m_name);
2273     m_encoding = conv.m_encoding;
2274 }
2275
2276 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2277 {
2278     Clear();
2279
2280     SetName(conv.m_name);
2281     m_encoding = conv.m_encoding;
2282
2283     return *this;
2284 }
2285
2286 void wxCSConv::Clear()
2287 {
2288     free(m_name);
2289     delete m_convReal;
2290
2291     m_name = NULL;
2292     m_convReal = NULL;
2293 }
2294
2295 void wxCSConv::SetName(const wxChar *charset)
2296 {
2297     if (charset)
2298     {
2299         m_name = wxStrdup(charset);
2300         m_deferred = true;
2301     }
2302 }
2303
2304 wxMBConv *wxCSConv::DoCreate() const
2305 {
2306     // check for the special case of ASCII or ISO8859-1 charset: as we have
2307     // special knowledge of it anyhow, we don't need to create a special
2308     // conversion object
2309     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2310     {
2311         // don't convert at all
2312         return NULL;
2313     }
2314
2315     // we trust OS to do conversion better than we can so try external
2316     // conversion methods first
2317     //
2318     // the full order is:
2319     //      1. OS conversion (iconv() under Unix or Win32 API)
2320     //      2. hard coded conversions for UTF
2321     //      3. wxEncodingConverter as fall back
2322
2323     // step (1)
2324 #ifdef HAVE_ICONV
2325 #if !wxUSE_FONTMAP
2326     if ( m_name )
2327 #endif // !wxUSE_FONTMAP
2328     {
2329         wxString name(m_name);
2330
2331 #if wxUSE_FONTMAP
2332         if ( name.empty() )
2333             name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2334 #endif // wxUSE_FONTMAP
2335
2336         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2337         if ( conv->IsOk() )
2338             return conv;
2339
2340         delete conv;
2341     }
2342 #endif // HAVE_ICONV
2343
2344 #ifdef wxHAVE_WIN32_MB2WC
2345     {
2346 #if wxUSE_FONTMAP
2347         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2348                                       : new wxMBConv_win32(m_encoding);
2349         if ( conv->IsOk() )
2350             return conv;
2351
2352         delete conv;
2353 #else
2354         return NULL;
2355 #endif
2356     }
2357 #endif // wxHAVE_WIN32_MB2WC
2358 #if defined(__WXMAC__)
2359     {
2360         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2361         {
2362
2363             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2364                                         : new wxMBConv_mac(m_encoding);
2365             if ( conv->IsOk() )
2366                  return conv;
2367
2368             delete conv;
2369         }
2370     }
2371 #endif
2372 #if defined(__WXCOCOA__)
2373     {
2374         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2375         {
2376
2377             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2378                                           : new wxMBConv_cocoa(m_encoding);
2379             if ( conv->IsOk() )
2380                  return conv;
2381
2382             delete conv;
2383         }
2384     }
2385 #endif
2386     // step (2)
2387     wxFontEncoding enc = m_encoding;
2388 #if wxUSE_FONTMAP
2389     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2390     {
2391         // use "false" to suppress interactive dialogs -- we can be called from
2392         // anywhere and popping up a dialog from here is the last thing we want to
2393         // do
2394         enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2395     }
2396 #endif // wxUSE_FONTMAP
2397
2398     switch ( enc )
2399     {
2400         case wxFONTENCODING_UTF7:
2401              return new wxMBConvUTF7;
2402
2403         case wxFONTENCODING_UTF8:
2404              return new wxMBConvUTF8;
2405
2406         case wxFONTENCODING_UTF16BE:
2407              return new wxMBConvUTF16BE;
2408
2409         case wxFONTENCODING_UTF16LE:
2410              return new wxMBConvUTF16LE;
2411
2412         case wxFONTENCODING_UTF32BE:
2413              return new wxMBConvUTF32BE;
2414
2415         case wxFONTENCODING_UTF32LE:
2416              return new wxMBConvUTF32LE;
2417
2418         default:
2419              // nothing to do but put here to suppress gcc warnings
2420              ;
2421     }
2422
2423     // step (3)
2424 #if wxUSE_FONTMAP
2425     {
2426         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2427                                       : new wxMBConv_wxwin(m_encoding);
2428         if ( conv->IsOk() )
2429             return conv;
2430
2431         delete conv;
2432     }
2433 #endif // wxUSE_FONTMAP
2434
2435     // NB: This is a hack to prevent deadlock. What could otherwise happen
2436     //     in Unicode build: wxConvLocal creation ends up being here
2437     //     because of some failure and logs the error. But wxLog will try to
2438     //     attach timestamp, for which it will need wxConvLocal (to convert
2439     //     time to char* and then wchar_t*), but that fails, tries to log
2440     //     error, but wxLog has a (already locked) critical section that
2441     //     guards static buffer.
2442     static bool alreadyLoggingError = false;
2443     if (!alreadyLoggingError)
2444     {
2445         alreadyLoggingError = true;
2446         wxLogError(_("Cannot convert from the charset '%s'!"),
2447                    m_name ? m_name
2448                       :
2449 #if wxUSE_FONTMAP
2450                          wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2451 #else // !wxUSE_FONTMAP
2452                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2453 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2454               );
2455         alreadyLoggingError = false;
2456     }
2457
2458     return NULL;
2459 }
2460
2461 void wxCSConv::CreateConvIfNeeded() const
2462 {
2463     if ( m_deferred )
2464     {
2465         wxCSConv *self = (wxCSConv *)this; // const_cast
2466
2467 #if wxUSE_INTL
2468         // if we don't have neither the name nor the encoding, use the default
2469         // encoding for this system
2470         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2471         {
2472             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2473         }
2474 #endif // wxUSE_INTL
2475
2476         self->m_convReal = DoCreate();
2477         self->m_deferred = false;
2478     }
2479 }
2480
2481 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2482 {
2483     CreateConvIfNeeded();
2484
2485     if (m_convReal)
2486         return m_convReal->MB2WC(buf, psz, n);
2487
2488     // latin-1 (direct)
2489     size_t len = strlen(psz);
2490
2491     if (buf)
2492     {
2493         for (size_t c = 0; c <= len; c++)
2494             buf[c] = (unsigned char)(psz[c]);
2495     }
2496
2497     return len;
2498 }
2499
2500 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2501 {
2502     CreateConvIfNeeded();
2503
2504     if (m_convReal)
2505         return m_convReal->WC2MB(buf, psz, n);
2506
2507     // latin-1 (direct)
2508     const size_t len = wxWcslen(psz);
2509     if (buf)
2510     {
2511         for (size_t c = 0; c <= len; c++)
2512         {
2513             if (psz[c] > 0xFF)
2514                 return (size_t)-1;
2515             buf[c] = (char)psz[c];
2516         }
2517     }
2518     else
2519     {
2520         for (size_t c = 0; c <= len; c++)
2521         {
2522             if (psz[c] > 0xFF)
2523                 return (size_t)-1;
2524         }
2525     }
2526
2527     return len;
2528 }
2529
2530 // ----------------------------------------------------------------------------
2531 // globals
2532 // ----------------------------------------------------------------------------
2533
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other global converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;


// the exported converters are references to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// wxConvCurrent initially points to the same object as wxConvLibc
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2554
2555 #else // !wxUSE_WCHAR_T
2556
// stand-ins in absence of wchar_t: the converters are plain wxMBConv objects
// in this build
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
2562
2563 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2564
2565