src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
  24   #pragma implementation "strconv.h"
  25 #endif
  26
  27 // For compilers that support precompilation, includes "wx.h".
  28 #include "wx/wxprec.h"
  29
  30 #ifdef __BORLANDC__
  31   #pragma hdrstop
  32 #endif
  33
  34 #ifndef WX_PRECOMP
  35     #include "wx/intl.h"
  36     #include "wx/log.h"
  37 #endif // WX_PRECOMP
  38
  39 #include "wx/strconv.h"
  40
  41 #if wxUSE_WCHAR_T
  42
  43 #ifdef __WXMSW__
  44     #include "wx/msw/private.h"
  45 #endif
  46
  47 #ifdef __WINDOWS__
  48     #include "wx/msw/missing.h"
  49 #endif
  50
  51 #ifndef __WXWINCE__
  52 #include <errno.h>
  53 #endif
  54
  55 #include <ctype.h>
  56 #include <string.h>
  57 #include <stdlib.h>
  58
  59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  60     #define wxHAVE_WIN32_MB2WC
  61 #endif // __WIN32__ but !__WXMICROWIN__
  62
  63 // ----------------------------------------------------------------------------
  64 // headers
  65 // ----------------------------------------------------------------------------
  66
  67 #ifdef __SALFORDC__
  68     #include <clib.h>
  69 #endif
  70
  71 #ifdef HAVE_ICONV
  72     #include <iconv.h>
  73     #include "wx/thread.h"
  74 #endif
  75
  76 #include "wx/encconv.h"
  77 #include "wx/fontmap.h"
  78 #include "wx/utils.h"
  79
  80 #ifdef __WXMAC__
  81 #include <ATSUnicode.h>
  82 #include <TextCommon.h>
  83 #include <TextEncodingConverter.h>
  84
  85 #include  "wx/mac/private.h"  // includes mac headers
  86 #endif
  87 // ----------------------------------------------------------------------------
  88 // macros
  89 // ----------------------------------------------------------------------------
  90
  91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
  92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
  93
// Select the wchar_t-dependent settings from the configured SIZEOF_WCHAR_T:
//
//  WC_NAME       name string for the wchar_t encoding in native byte order
//  WC_NAME_BEST  same, but with the byte order spelled out explicitly
//                (chosen according to WORDS_BIGENDIAN)
//  WC_BSWAP      the BSWAP_xxx macro matching the width of wchar_t
//  WC_UTF16      defined only for 2 byte wchar_t; the converters in this
//                file test it to pick their UTF-16 or UCS-4 code path
//
// NOTE(review): the two name strings are presumably passed to iconv
// elsewhere in the file -- this is not visible from here.
#if SIZEOF_WCHAR_T == 4
    #define WC_NAME         "UCS4"
    #define WC_BSWAP         BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UCS-4BE"
    #else
      #define WC_NAME_BEST  "UCS-4LE"
    #endif
#elif SIZEOF_WCHAR_T == 2
    #define WC_NAME         "UTF16"
    #define WC_BSWAP         BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UTF-16BE"
    #else
      #define WC_NAME_BEST  "UTF-16LE"
    #endif
#else // sizeof(wchar_t) != 2 nor 4
    // does this ever happen?
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
 115
 116 // ============================================================================
 117 // implementation
 118 // ============================================================================
 119
 120 // ----------------------------------------------------------------------------
 121 // UTF-16 en/decoding to/from UCS-4
 122 // ----------------------------------------------------------------------------
 123
 124
 125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 126 {
 127     if (input<=0xffff)
 128     {
 129         if (output)
 130             *output = (wxUint16) input;
 131         return 1;
 132     }
 133     else if (input>=0x110000)
 134     {
 135         return (size_t)-1;
 136     }
 137     else
 138     {
 139         if (output)
 140         {
 141             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 142             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 143         }
 144         return 2;
 145     }
 146 }
 147
 148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 149 {
 150     if ((*input<0xd800) || (*input>0xdfff))
 151     {
 152         output = *input;
 153         return 1;
 154     }
 155     else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
 156     {
 157         output = *input;
 158         return (size_t)-1;
 159     }
 160     else
 161     {
 162         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 163         return 2;
 164     }
 165 }
 166
 167
 168 // ----------------------------------------------------------------------------
 169 // wxMBConv
 170 // ----------------------------------------------------------------------------
 171
// The destructor has an empty body but is deliberately defined out of line in
// this file.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 176
 177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 178 {
 179     if ( psz )
 180     {
 181         // calculate the length of the buffer needed first
 182         size_t nLen = MB2WC(NULL, psz, 0);
 183         if ( nLen != (size_t)-1 )
 184         {
 185             // now do the actual conversion
 186             wxWCharBuffer buf(nLen);
 187             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 188             if ( nLen != (size_t)-1 )
 189             {
 190                 return buf;
 191             }
 192         }
 193     }
 194
 195     wxWCharBuffer buf((wchar_t *)NULL);
 196
 197     return buf;
 198 }
 199
 200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 201 {
 202     if ( pwz )
 203     {
 204         size_t nLen = WC2MB(NULL, pwz, 0);
 205         if ( nLen != (size_t)-1 )
 206         {
 207             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 208             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 209             if ( nLen != (size_t)-1 )
 210             {
 211                 return buf;
 212             }
 213         }
 214     }
 215
 216     wxCharBuffer buf((char *)NULL);
 217
 218     return buf;
 219 }
 220
 221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
 222 {
 223     wxASSERT(pOutSize != NULL);
 224
 225     const char* szEnd = szString + nStringLen + 1;
 226     const char* szPos = szString;
 227     const char* szStart = szPos;
 228
 229     size_t nActualLength = 0;
 230     size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
 231
 232     wxWCharBuffer theBuffer(nCurrentSize);
 233
 234     //Convert the string until the length() is reached, continuing the
 235     //loop every time a null character is reached
 236     while(szPos != szEnd)
 237     {
 238         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 239
 240         //Get the length of the current (sub)string
 241         size_t nLen = MB2WC(NULL, szPos, 0);
 242
 243         //Invalid conversion?
 244         if( nLen == (size_t)-1 )
 245         {
 246             *pOutSize = 0;
 247             theBuffer.data()[0u] = wxT('\0');
 248             return theBuffer;
 249         }
 250
 251
 252         //Increase the actual length (+1 for current null character)
 253         nActualLength += nLen + 1;
 254
 255         //if buffer too big, realloc the buffer
 256         if (nActualLength > (nCurrentSize+1))
 257         {
 258             wxWCharBuffer theNewBuffer(nCurrentSize << 1);
 259             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
 260             theBuffer = theNewBuffer;
 261             nCurrentSize <<= 1;
 262         }
 263
 264         //Convert the current (sub)string
 265         if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 266         {
 267             *pOutSize = 0;
 268             theBuffer.data()[0u] = wxT('\0');
 269             return theBuffer;
 270         }
 271
 272         //Increment to next (sub)string
 273         //Note that we have to use strlen here instead of nLen
 274         //here because XX2XX gives us the size of the output buffer,
 275         //not neccessarly the length of the string
 276         szPos += strlen(szPos) + 1;
 277     }
 278
 279     //success - return actual length and the buffer
 280     *pOutSize = nActualLength;
 281     return theBuffer;
 282 }
 283
 284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
 285 {
 286     wxASSERT(pOutSize != NULL);
 287
 288     const wchar_t* szEnd = szString + nStringLen + 1;
 289     const wchar_t* szPos = szString;
 290     const wchar_t* szStart = szPos;
 291
 292     size_t nActualLength = 0;
 293     size_t nCurrentSize = nStringLen << 2; //try * 4 first
 294
 295     wxCharBuffer theBuffer(nCurrentSize);
 296
 297     //Convert the string until the length() is reached, continuing the
 298     //loop every time a null character is reached
 299     while(szPos != szEnd)
 300     {
 301         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 302
 303         //Get the length of the current (sub)string
 304         size_t nLen = WC2MB(NULL, szPos, 0);
 305
 306         //Invalid conversion?
 307         if( nLen == (size_t)-1 )
 308         {
 309             *pOutSize = 0;
 310             theBuffer.data()[0u] = wxT('\0');
 311             return theBuffer;
 312         }
 313
 314         //Increase the actual length (+1 for current null character)
 315         nActualLength += nLen + 1;
 316
 317         //if buffer too big, realloc the buffer
 318         if (nActualLength > (nCurrentSize+1))
 319         {
 320             wxCharBuffer theNewBuffer(nCurrentSize << 1);
 321             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
 322             theBuffer = theNewBuffer;
 323             nCurrentSize <<= 1;
 324         }
 325
 326         //Convert the current (sub)string
 327         if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 328         {
 329             *pOutSize = 0;
 330             theBuffer.data()[0u] = wxT('\0');
 331             return theBuffer;
 332         }
 333
 334         //Increment to next (sub)string
 335         //Note that we have to use wxWcslen here instead of nLen
 336         //here because XX2XX gives us the size of the output buffer,
 337         //not neccessarly the length of the string
 338         szPos += wxWcslen(szPos) + 1;
 339     }
 340
 341     //success - return actual length and the buffer
 342     *pOutSize = nActualLength;
 343     return theBuffer;
 344 }
 345
 346 // ----------------------------------------------------------------------------
 347 // wxMBConvLibc
 348 // ----------------------------------------------------------------------------
 349
// Multibyte to wide character conversion: simply forwards all three arguments
// to wxMB2WC() and returns its result unchanged.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 354
// Wide character to multibyte conversion: simply forwards all three arguments
// to wxWC2MB() and returns its result unchanged.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 359 // ----------------------------------------------------------------------------
 360 // UTF-7
 361 // ----------------------------------------------------------------------------
 362
 363 // Implementation (C) 2004 Fredrik Roubert
 364
//
// BASE64 decoding table
//
// Indexed by the value of an input byte: gives the 6 bit value of the BASE64
// digit ('A'-'Z' are 0x00-0x19, 'a'-'z' 0x1a-0x33, '0'-'9' 0x34-0x3d, '+' is
// 0x3e and '/' is 0x3f) or 0xff for any byte which is not a BASE64 digit.
// The entry for NUL is 0xff too, so the end of the string also ends a BASE64
// run in the decoder.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 403
 404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 405 {
 406     size_t len = 0;
 407
 408     while (*psz && ((!buf) || (len < n)))
 409     {
 410         unsigned char cc = *psz++;
 411         if (cc != '+')
 412         {
 413             // plain ASCII char
 414             if (buf)
 415                 *buf++ = cc;
 416             len++;
 417         }
 418         else if (*psz == '-')
 419         {
 420             // encoded plus sign
 421             if (buf)
 422                 *buf++ = cc;
 423             len++;
 424             psz++;
 425         }
 426         else
 427         {
 428             // BASE64 encoded string
 429             bool lsb;
 430             unsigned char c;
 431             unsigned int d, l;
 432             for (lsb = false, d = 0, l = 0;
 433                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
 434             {
 435                 d <<= 6;
 436                 d += cc;
 437                 for (l += 6; l >= 8; lsb = !lsb)
 438                 {
 439                     c = (unsigned char)((d >> (l -= 8)) % 256);
 440                     if (lsb)
 441                     {
 442                         if (buf)
 443                             *buf++ |= c;
 444                         len ++;
 445                     }
 446                     else
 447                         if (buf)
 448                             *buf = (wchar_t)(c << 8);
 449                 }
 450             }
 451             if (*psz == '-')
 452                 psz++;
 453         }
 454     }
 455     if (buf && (len < n))
 456         *buf = 0;
 457     return len;
 458 }
 459
//
// BASE64 encoding table
//
// Maps a 6 bit value (0..63) to the corresponding BASE64 digit.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 474
//
// UTF-7 encoding table
//
// Indexed by a 7 bit ASCII code, one row per 16 codes; the values are:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder in this file writes only the characters of class 0 as
// themselves, everything else is BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 494
 495 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 496 {
 497
 498
 499     size_t len = 0;
 500
 501     while (*psz && ((!buf) || (len < n)))
 502     {
 503         wchar_t cc = *psz++;
 504         if (cc < 0x80 && utf7encode[cc] < 1)
 505         {
 506             // plain ASCII char
 507             if (buf)
 508                 *buf++ = (char)cc;
 509             len++;
 510         }
 511 #ifndef WC_UTF16
 512         else if (((wxUint32)cc) > 0xffff)
 513             {
 514             // no surrogate pair generation (yet?)
 515             return (size_t)-1;
 516         }
 517 #endif
 518         else
 519         {
 520             if (buf)
 521                 *buf++ = '+';
 522             len++;
 523             if (cc != '+')
 524             {
 525                 // BASE64 encode string
 526                 unsigned int lsb, d, l;
 527                 for (d = 0, l = 0;; psz++)
 528                 {
 529                     for (lsb = 0; lsb < 2; lsb ++)
 530                     {
 531                         d <<= 8;
 532                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 533
 534                         for (l += 8; l >= 6; )
 535                         {
 536                             l -= 6;
 537                             if (buf)
 538                                 *buf++ = utf7enb64[(d >> l) % 64];
 539                             len++;
 540                         }
 541                     }
 542                     cc = *psz;
 543                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 544                         break;
 545                 }
 546                 if (l != 0)
 547                 {
 548                     if (buf)
 549                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 550                     len++;
 551                 }
 552             }
 553             if (buf)
 554                 *buf++ = '-';
 555             len++;
 556         }
 557     }
 558     if (buf && (len < n))
 559         *buf = 0;
 560     return len;
 561 }
 562
 563 // ----------------------------------------------------------------------------
 564 // UTF-8
 565 // ----------------------------------------------------------------------------
 566
 567 static wxUint32 utf8_max[]=
 568     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 569
 570 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 571 {
 572     size_t len = 0;
 573
 574     while (*psz && ((!buf) || (len < n)))
 575     {
 576         unsigned char cc = *psz++, fc = cc;
 577         unsigned cnt;
 578         for (cnt = 0; fc & 0x80; cnt++)
 579             fc <<= 1;
 580         if (!cnt)
 581         {
 582             // plain ASCII char
 583             if (buf)
 584                 *buf++ = cc;
 585             len++;
 586         }
 587         else
 588         {
 589             cnt--;
 590             if (!cnt)
 591             {
 592                 // invalid UTF-8 sequence
 593                 return (size_t)-1;
 594             }
 595             else
 596             {
 597                 unsigned ocnt = cnt - 1;
 598                 wxUint32 res = cc & (0x3f >> cnt);
 599                 while (cnt--)
 600                 {
 601                     cc = *psz++;
 602                     if ((cc & 0xC0) != 0x80)
 603                     {
 604                         // invalid UTF-8 sequence
 605                         return (size_t)-1;
 606                     }
 607                     res = (res << 6) | (cc & 0x3f);
 608                 }
 609                 if (res <= utf8_max[ocnt])
 610                 {
 611                     // illegal UTF-8 encoding
 612                     return (size_t)-1;
 613                 }
 614 #ifdef WC_UTF16
 615                 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 616                 size_t pa = encode_utf16(res, (wxUint16 *)buf);
 617                 if (pa == (size_t)-1)
 618                   return (size_t)-1;
 619                 if (buf)
 620                     buf += pa;
 621                 len += pa;
 622 #else // !WC_UTF16
 623                 if (buf)
 624                     *buf++ = res;
 625                 len++;
 626 #endif // WC_UTF16/!WC_UTF16
 627             }
 628         }
 629     }
 630     if (buf && (len < n))
 631         *buf = 0;
 632     return len;
 633 }
 634
 635 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 636 {
 637     size_t len = 0;
 638
 639     while (*psz && ((!buf) || (len < n)))
 640     {
 641         wxUint32 cc;
 642 #ifdef WC_UTF16
 643         // cast is ok for WC_UTF16
 644         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 645         psz += (pa == (size_t)-1) ? 1 : pa;
 646 #else
 647         cc=(*psz++) & 0x7fffffff;
 648 #endif
 649         unsigned cnt;
 650         for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 651         if (!cnt)
 652         {
 653             // plain ASCII char
 654             if (buf)
 655                 *buf++ = (char) cc;
 656             len++;
 657         }
 658
 659         else
 660         {
 661             len += cnt + 1;
 662             if (buf)
 663             {
 664                 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 665                 while (cnt--)
 666                     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 667             }
 668         }
 669     }
 670
 671     if (buf && (len<n)) *buf = 0;
 672
 673     return len;
 674 }
 675
 676
 677
 678
 679 // ----------------------------------------------------------------------------
 680 // UTF-16
 681 // ----------------------------------------------------------------------------
 682
// The UTF-16 converters below are written once for data in the byte order of
// this machine ("straight") and once for data in the opposite byte order
// ("swap"); these macros map the two names to the real classes according to
// WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 690
 691
 692 #ifdef WC_UTF16
 693
 694 // copy 16bit MB to 16bit String
 695 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 696 {
 697     size_t len=0;
 698
 699     while (*(wxUint16*)psz && (!buf || len < n))
 700     {
 701         if (buf)
 702             *buf++ = *(wxUint16*)psz;
 703         len++;
 704
 705         psz += sizeof(wxUint16);
 706     }
 707     if (buf && len<n)   *buf=0;
 708
 709     return len;
 710 }
 711
 712
 713 // copy 16bit String to 16bit MB
 714 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 715 {
 716     size_t len=0;
 717
 718     while (*psz && (!buf || len < n))
 719     {
 720         if (buf)
 721         {
 722             *(wxUint16*)buf = *psz;
 723             buf += sizeof(wxUint16);
 724         }
 725         len += sizeof(wxUint16);
 726         psz++;
 727     }
 728     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 729
 730     return len;
 731 }
 732
 733
 734 // swap 16bit MB to 16bit String
 735 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 736 {
 737     size_t len=0;
 738
 739     while (*(wxUint16*)psz && (!buf || len < n))
 740     {
 741         if (buf)
 742         {
 743             ((char *)buf)[0] = psz[1];
 744             ((char *)buf)[1] = psz[0];
 745             buf++;
 746         }
 747         len++;
 748         psz += sizeof(wxUint16);
 749     }
 750     if (buf && len<n)   *buf=0;
 751
 752     return len;
 753 }
 754
 755
 756 // swap 16bit MB to 16bit String
 757 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 758 {
 759     size_t len=0;
 760
 761     while (*psz && (!buf || len < n))
 762     {
 763         if (buf)
 764         {
 765             *buf++ = ((char*)psz)[1];
 766             *buf++ = ((char*)psz)[0];
 767         }
 768         len += sizeof(wxUint16);
 769         psz++;
 770     }
 771     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 772
 773     return len;
 774 }
 775
 776
 777 #else // WC_UTF16
 778
 779
 780 // copy 16bit MB to 32bit String
 781 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 782 {
 783     size_t len=0;
 784
 785     while (*(wxUint16*)psz && (!buf || len < n))
 786     {
 787         wxUint32 cc;
 788         size_t pa=decode_utf16((wxUint16*)psz, cc);
 789         if (pa == (size_t)-1)
 790             return pa;
 791
 792         if (buf)
 793             *buf++ = cc;
 794         len++;
 795         psz += pa * sizeof(wxUint16);
 796     }
 797     if (buf && len<n)   *buf=0;
 798
 799     return len;
 800 }
 801
 802
 803 // copy 32bit String to 16bit MB
 804 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 805 {
 806     size_t len=0;
 807
 808     while (*psz && (!buf || len < n))
 809     {
 810         wxUint16 cc[2];
 811         size_t pa=encode_utf16(*psz, cc);
 812
 813         if (pa == (size_t)-1)
 814             return pa;
 815
 816         if (buf)
 817         {
 818             *(wxUint16*)buf = cc[0];
 819             buf += sizeof(wxUint16);
 820             if (pa > 1)
 821             {
 822                 *(wxUint16*)buf = cc[1];
 823                 buf += sizeof(wxUint16);
 824             }
 825         }
 826
 827         len += pa*sizeof(wxUint16);
 828         psz++;
 829     }
 830     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 831
 832     return len;
 833 }
 834
 835
 836 // swap 16bit MB to 32bit String
 837 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 838 {
 839     size_t len=0;
 840
 841     while (*(wxUint16*)psz && (!buf || len < n))
 842     {
 843         wxUint32 cc;
 844         char tmp[4];
 845         tmp[0]=psz[1];  tmp[1]=psz[0];
 846         tmp[2]=psz[3];  tmp[3]=psz[2];
 847
 848         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 849         if (pa == (size_t)-1)
 850             return pa;
 851
 852         if (buf)
 853             *buf++ = cc;
 854
 855         len++;
 856         psz += pa * sizeof(wxUint16);
 857     }
 858     if (buf && len<n)   *buf=0;
 859
 860     return len;
 861 }
 862
 863
 864 // swap 32bit String to 16bit MB
 865 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 866 {
 867     size_t len=0;
 868
 869     while (*psz && (!buf || len < n))
 870     {
 871         wxUint16 cc[2];
 872         size_t pa=encode_utf16(*psz, cc);
 873
 874         if (pa == (size_t)-1)
 875             return pa;
 876
 877         if (buf)
 878         {
 879             *buf++ = ((char*)cc)[1];
 880             *buf++ = ((char*)cc)[0];
 881             if (pa > 1)
 882             {
 883                 *buf++ = ((char*)cc)[3];
 884                 *buf++ = ((char*)cc)[2];
 885             }
 886         }
 887
 888         len += pa*sizeof(wxUint16);
 889         psz++;
 890     }
 891     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 892
 893     return len;
 894 }
 895
 896 #endif // WC_UTF16
 897
 898
 899 // ----------------------------------------------------------------------------
 900 // UTF-32
 901 // ----------------------------------------------------------------------------
 902
// The UTF-32 converters below are written once for data in the byte order of
// this machine ("straight") and once for data in the opposite byte order
// ("swap"); these macros map the two names to the real classes according to
// WORDS_BIGENDIAN.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif
 910
 911
// The global little and big endian UTF-32 converter objects.
// NOTE(review): WXDLLIMPEXP_DATA_BASE is defined elsewhere; it presumably
// adds the DLL import/export decoration to the declaration -- to be confirmed.
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
 914
 915
 916 #ifdef WC_UTF16
 917
 918 // copy 32bit MB to 16bit String
 919 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 920 {
 921     size_t len=0;
 922
 923     while (*(wxUint32*)psz && (!buf || len < n))
 924     {
 925         wxUint16 cc[2];
 926
 927         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
 928         if (pa == (size_t)-1)
 929             return pa;
 930
 931         if (buf)
 932         {
 933             *buf++ = cc[0];
 934             if (pa > 1)
 935                 *buf++ = cc[1];
 936         }
 937         len += pa;
 938         psz += sizeof(wxUint32);
 939     }
 940     if (buf && len<n)   *buf=0;
 941
 942     return len;
 943 }
 944
 945
 946 // copy 16bit String to 32bit MB
 947 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 948 {
 949     size_t len=0;
 950
 951     while (*psz && (!buf || len < n))
 952     {
 953         wxUint32 cc;
 954
 955         // cast is ok for WC_UTF16
 956         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 957         if (pa == (size_t)-1)
 958             return pa;
 959
 960         if (buf)
 961         {
 962             *(wxUint32*)buf = cc;
 963             buf += sizeof(wxUint32);
 964         }
 965         len += sizeof(wxUint32);
 966         psz += pa;
 967     }
 968
 969     if (buf && len<=n-sizeof(wxUint32))
 970         *(wxUint32*)buf=0;
 971
 972     return len;
 973 }
 974
 975
 976
 977 // swap 32bit MB to 16bit String
 978 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 979 {
 980     size_t len=0;
 981
 982     while (*(wxUint32*)psz && (!buf || len < n))
 983     {
 984         char tmp[4];
 985         tmp[0] = psz[3];   tmp[1] = psz[2];
 986         tmp[2] = psz[1];   tmp[3] = psz[0];
 987
 988
 989         wxUint16 cc[2];
 990
 991         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
 992         if (pa == (size_t)-1)
 993             return pa;
 994
 995         if (buf)
 996         {
 997             *buf++ = cc[0];
 998             if (pa > 1)
 999                 *buf++ = cc[1];
1000         }
1001         len += pa;
1002         psz += sizeof(wxUint32);
1003     }
1004
1005     if (buf && len<n)
1006         *buf=0;
1007
1008     return len;
1009 }
1010
1011
1012 // swap 16bit String to 32bit MB
1013 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014 {
1015     size_t len=0;
1016
1017     while (*psz && (!buf || len < n))
1018     {
1019         char cc[4];
1020
1021         // cast is ok for WC_UTF16
1022         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1023         if (pa == (size_t)-1)
1024             return pa;
1025
1026         if (buf)
1027         {
1028             *buf++ = cc[3];
1029             *buf++ = cc[2];
1030             *buf++ = cc[1];
1031             *buf++ = cc[0];
1032         }
1033         len += sizeof(wxUint32);
1034         psz += pa;
1035     }
1036
1037     if (buf && len<=n-sizeof(wxUint32))
1038         *(wxUint32*)buf=0;
1039
1040     return len;
1041 }
1042
1043 #else // WC_UTF16
1044
1045
1046 // copy 32bit MB to 32bit String
1047 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1048 {
1049     size_t len=0;
1050
1051     while (*(wxUint32*)psz && (!buf || len < n))
1052     {
1053         if (buf)
1054             *buf++ = *(wxUint32*)psz;
1055         len++;
1056         psz += sizeof(wxUint32);
1057     }
1058
1059     if (buf && len<n)
1060         *buf=0;
1061
1062     return len;
1063 }
1064
1065
1066 // copy 32bit String to 32bit MB
1067 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1068 {
1069     size_t len=0;
1070
1071     while (*psz && (!buf || len < n))
1072     {
1073         if (buf)
1074         {
1075             *(wxUint32*)buf = *psz;
1076             buf += sizeof(wxUint32);
1077         }
1078
1079         len += sizeof(wxUint32);
1080         psz++;
1081     }
1082
1083     if (buf && len<=n-sizeof(wxUint32))
1084         *(wxUint32*)buf=0;
1085
1086     return len;
1087 }
1088
1089
1090 // swap 32bit MB to 32bit String
1091 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1092 {
1093     size_t len=0;
1094
1095     while (*(wxUint32*)psz && (!buf || len < n))
1096     {
1097         if (buf)
1098         {
1099             ((char *)buf)[0] = psz[3];
1100             ((char *)buf)[1] = psz[2];
1101             ((char *)buf)[2] = psz[1];
1102             ((char *)buf)[3] = psz[0];
1103             buf++;
1104         }
1105         len++;
1106         psz += sizeof(wxUint32);
1107     }
1108
1109     if (buf && len<n)
1110         *buf=0;
1111
1112     return len;
1113 }
1114
1115
1116 // swap 32bit String to 32bit MB
1117 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1118 {
1119     size_t len=0;
1120
1121     while (*psz && (!buf || len < n))
1122     {
1123         if (buf)
1124         {
1125             *buf++ = ((char *)psz)[3];
1126             *buf++ = ((char *)psz)[2];
1127             *buf++ = ((char *)psz)[1];
1128             *buf++ = ((char *)psz)[0];
1129         }
1130         len += sizeof(wxUint32);
1131         psz++;
1132     }
1133
1134     if (buf && len<=n-sizeof(wxUint32))
1135         *(wxUint32*)buf=0;
1136
1137     return len;
1138 }
1139
1140
1141 #endif // WC_UTF16
1142
1143
1144 // ============================================================================
1145 // The classes doing conversion using the iconv_xxx() functions
1146 // ============================================================================
1147
1148 #ifdef HAVE_ICONV
1149
// VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
//     E2BIG if output buffer is _exactly_ as big as needed. Such case is
//     (unless there's yet another bug in glibc) the only case when iconv()
//     returns with (size_t)-1 (which means error) and says there are 0 bytes
//     left in the input buffer -- when _real_ error occurs,
//     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
//     iconv() failure.
//     [This bug does not appear in glibc 2.2.]
//
// ICONV_FAILED(cres, bufLeft): cres is the value returned by iconv() and
// bufLeft the number of input bytes it left unconverted. The arguments are
// parenthesized so that arbitrary expressions can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type iconv() expects for its
// input buffer argument (ICONV_CONST is defined elsewhere)
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1166
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1170
1171 class wxMBConv_iconv : public wxMBConv
1172 {
1173 public:
1174     wxMBConv_iconv(const wxChar *name);
1175     virtual ~wxMBConv_iconv();
1176
1177     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1178     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1179
1180     bool IsOk() const
1181         { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1182
1183 protected:
1184     // the iconv handlers used to translate from multibyte to wide char and in
1185     // the other direction
1186     iconv_t m2w,
1187             w2m;
1188 #if wxUSE_THREADS
1189     // guards access to m2w and w2m objects
1190     wxMutex m_iconvMutex;
1191 #endif
1192
1193 private:
1194     // the name (for iconv_open()) of a wide char charset -- if none is
1195     // available on this machine, it will remain NULL
1196     static const char *ms_wcCharsetName;
1197
1198     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1199     // different endian-ness than the native one
1200     static bool ms_wcNeedsSwap;
1201 };
1202
1203 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1204 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1205
1206 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1207 {
1208     // Do it the hard way
1209     char cname[100];
1210     for (size_t i = 0; i < wxStrlen(name)+1; i++)
1211         cname[i] = (char) name[i];
1212
1213     // check for charset that represents wchar_t:
1214     if (ms_wcCharsetName == NULL)
1215     {
1216         ms_wcNeedsSwap = false;
1217
1218         // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1219         ms_wcCharsetName = WC_NAME_BEST;
1220         m2w = iconv_open(ms_wcCharsetName, cname);
1221
1222         if (m2w == (iconv_t)-1)
1223         {
1224             // try charset w/o bytesex info (e.g. "UCS4")
1225             // and check for bytesex ourselves:
1226             ms_wcCharsetName = WC_NAME;
1227             m2w = iconv_open(ms_wcCharsetName, cname);
1228
1229             // last bet, try if it knows WCHAR_T pseudo-charset
1230             if (m2w == (iconv_t)-1)
1231             {
1232                 ms_wcCharsetName = "WCHAR_T";
1233                 m2w = iconv_open(ms_wcCharsetName, cname);
1234             }
1235
1236             if (m2w != (iconv_t)-1)
1237             {
1238                 char    buf[2], *bufPtr;
1239                 wchar_t wbuf[2], *wbufPtr;
1240                 size_t  insz, outsz;
1241                 size_t  res;
1242
1243                 buf[0] = 'A';
1244                 buf[1] = 0;
1245                 wbuf[0] = 0;
1246                 insz = 2;
1247                 outsz = SIZEOF_WCHAR_T * 2;
1248                 wbufPtr = wbuf;
1249                 bufPtr = buf;
1250
1251                 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1252                             (char**)&wbufPtr, &outsz);
1253
1254                 if (ICONV_FAILED(res, insz))
1255                 {
1256                     ms_wcCharsetName = NULL;
1257                     wxLogLastError(wxT("iconv"));
1258                     wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1259                 }
1260                 else
1261                 {
1262                     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1263                 }
1264             }
1265             else
1266             {
1267                 ms_wcCharsetName = NULL;
1268
1269                 // VS: we must not output an error here, since wxWidgets will safely
1270                 //     fall back to using wxEncodingConverter.
1271                 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1272                 //wxLogError(
1273             }
1274         }
1275         wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1276     }
1277     else // we already have ms_wcCharsetName
1278     {
1279         m2w = iconv_open(ms_wcCharsetName, cname);
1280     }
1281
1282     // NB: don't ever pass NULL to iconv_open(), it may crash!
1283     if ( ms_wcCharsetName )
1284     {
1285         w2m = iconv_open( cname, ms_wcCharsetName);
1286     }
1287     else
1288     {
1289         w2m = (iconv_t)-1;
1290     }
1291 }
1292
1293 wxMBConv_iconv::~wxMBConv_iconv()
1294 {
1295     if ( m2w != (iconv_t)-1 )
1296         iconv_close(m2w);
1297     if ( w2m != (iconv_t)-1 )
1298         iconv_close(w2m);
1299 }
1300
1301 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1302 {
1303 #if wxUSE_THREADS
1304     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1305     //     Unfortunately there is a couple of global wxCSConv objects such as
1306     //     wxConvLocal that are used all over wx code, so we have to make sure
1307     //     the handle is used by at most one thread at the time. Otherwise
1308     //     only a few wx classes would be safe to use from non-main threads
1309     //     as MB<->WC conversion would fail "randomly".
1310     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1311 #endif
1312
1313     size_t inbuf = strlen(psz);
1314     size_t outbuf = n * SIZEOF_WCHAR_T;
1315     size_t res, cres;
1316     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1317     wchar_t *bufPtr = buf;
1318     const char *pszPtr = psz;
1319
1320     if (buf)
1321     {
1322         // have destination buffer, convert there
1323         cres = iconv(m2w,
1324                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1325                      (char**)&bufPtr, &outbuf);
1326         res = n - (outbuf / SIZEOF_WCHAR_T);
1327
1328         if (ms_wcNeedsSwap)
1329         {
1330             // convert to native endianness
1331             WC_BSWAP(buf /* _not_ bufPtr */, res)
1332         }
1333
1334         // NB: iconv was given only strlen(psz) characters on input, and so
1335         //     it couldn't convert the trailing zero. Let's do it ourselves
1336         //     if there's some room left for it in the output buffer.
1337         if (res < n)
1338             buf[res] = 0;
1339     }
1340     else
1341     {
1342         // no destination buffer... convert using temp buffer
1343         // to calculate destination buffer requirement
1344         wchar_t tbuf[8];
1345         res = 0;
1346         do {
1347             bufPtr = tbuf;
1348             outbuf = 8*SIZEOF_WCHAR_T;
1349
1350             cres = iconv(m2w,
1351                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1352                          (char**)&bufPtr, &outbuf );
1353
1354             res += 8-(outbuf/SIZEOF_WCHAR_T);
1355         } while ((cres==(size_t)-1) && (errno==E2BIG));
1356     }
1357
1358     if (ICONV_FAILED(cres, inbuf))
1359     {
1360         //VS: it is ok if iconv fails, hence trace only
1361         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1362         return (size_t)-1;
1363     }
1364
1365     return res;
1366 }
1367
1368 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1369 {
1370 #if wxUSE_THREADS
1371     // NB: explained in MB2WC
1372     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1373 #endif
1374
1375     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1376     size_t outbuf = n;
1377     size_t res, cres;
1378
1379     wchar_t *tmpbuf = 0;
1380
1381     if (ms_wcNeedsSwap)
1382     {
1383         // need to copy to temp buffer to switch endianness
1384         // this absolutely doesn't rock!
1385         // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1386         //  could be in read-only memory, or be accessed in some other thread)
1387         tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1388         memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1389         WC_BSWAP(tmpbuf, inbuf)
1390         psz=tmpbuf;
1391     }
1392
1393     if (buf)
1394     {
1395         // have destination buffer, convert there
1396         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1397
1398         res = n-outbuf;
1399
1400         // NB: iconv was given only wcslen(psz) characters on input, and so
1401         //     it couldn't convert the trailing zero. Let's do it ourselves
1402         //     if there's some room left for it in the output buffer.
1403         if (res < n)
1404             buf[0] = 0;
1405     }
1406     else
1407     {
1408         // no destination buffer... convert using temp buffer
1409         // to calculate destination buffer requirement
1410         char tbuf[16];
1411         res = 0;
1412         do {
1413             buf = tbuf; outbuf = 16;
1414
1415             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1416
1417             res += 16 - outbuf;
1418         } while ((cres==(size_t)-1) && (errno==E2BIG));
1419     }
1420
1421     if (ms_wcNeedsSwap)
1422     {
1423         free(tmpbuf);
1424     }
1425
1426     if (ICONV_FAILED(cres, inbuf))
1427     {
1428         //VS: it is ok if iconv fails, hence trace only
1429         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1430         return (size_t)-1;
1431     }
1432
1433     return res;
1434 }
1435
1436 #endif // HAVE_ICONV
1437
1438
1439 // ============================================================================
1440 // Win32 conversion classes
1441 // ============================================================================
1442
1443 #ifdef wxHAVE_WIN32_MB2WC
1444
1445 // from utils.cpp
1446 #if wxUSE_FONTMAP
1447 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1448 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1449 #endif
1450
1451 class wxMBConv_win32 : public wxMBConv
1452 {
1453 public:
1454     wxMBConv_win32()
1455     {
1456         m_CodePage = CP_ACP;
1457     }
1458
1459 #if wxUSE_FONTMAP
1460     wxMBConv_win32(const wxChar* name)
1461     {
1462         m_CodePage = wxCharsetToCodepage(name);
1463     }
1464
1465     wxMBConv_win32(wxFontEncoding encoding)
1466     {
1467         m_CodePage = wxEncodingToCodepage(encoding);
1468     }
1469 #endif
1470
1471     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1472     {
1473         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1474         // the behaviour is not compatible with the Unix version (using iconv)
1475         // and break the library itself, e.g. wxTextInputStream::NextChar()
1476         // wouldn't work if reading an incomplete MB char didn't result in an
1477         // error
1478         //
1479         // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
1480         // an error (tested under Windows Server 2003) and apparently it is
1481         // done on purpose, i.e. the function accepts any input in this case
1482         // and although I'd prefer to return error on ill-formed output, our
1483         // own wxMBConvUTF7 doesn't detect errors (e.g. lone "+" which is
1484         // explicitly ill-formed according to RFC 2152) neither so we don't
1485         // even have any fallback here...
1486         int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;
1487
1488         const size_t len = ::MultiByteToWideChar
1489                              (
1490                                 m_CodePage,     // code page
1491                                 flags,          // flags: fall on error
1492                                 psz,            // input string
1493                                 -1,             // its length (NUL-terminated)
1494                                 buf,            // output string
1495                                 buf ? n : 0     // size of output buffer
1496                              );
1497
1498         // note that it returns count of written chars for buf != NULL and size
1499         // of the needed buffer for buf == NULL so in either case the length of
1500         // the string (which never includes the terminating NUL) is one less
1501         return len ? len - 1 : (size_t)-1;
1502     }
1503
1504     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1505     {
1506         /*
1507             we have a problem here: by default, WideCharToMultiByte() may
1508             replace characters unrepresentable in the target code page with bad
1509             quality approximations such as turning "1/2" symbol (U+00BD) into
1510             "1" for the code pages which don't have it and we, obviously, want
1511             to avoid this at any price
1512
1513             the trouble is that this function does it _silently_, i.e. it won't
1514             even tell us whether it did or not... Win98/2000 and higher provide
1515             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1516             we have to resort to a round trip, i.e. check that converting back
1517             results in the same string -- this is, of course, expensive but
1518             otherwise we simply can't be sure to not garble the data.
1519          */
1520
1521         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1522         // it doesn't work with CJK encodings (which we test for rather roughly
1523         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1524         // supporting it
1525         BOOL usedDef wxDUMMY_INITIALIZE(false);
1526         BOOL *pUsedDef;
1527         int flags;
1528         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1529         {
1530             // it's our lucky day
1531             flags = WC_NO_BEST_FIT_CHARS;
1532             pUsedDef = &usedDef;
1533         }
1534         else // old system or unsupported encoding
1535         {
1536             flags = 0;
1537             pUsedDef = NULL;
1538         }
1539
1540         const size_t len = ::WideCharToMultiByte
1541                              (
1542                                 m_CodePage,     // code page
1543                                 flags,          // either none or no best fit
1544                                 pwz,            // input string
1545                                 -1,             // it is (wide) NUL-terminated
1546                                 buf,            // output buffer
1547                                 buf ? n : 0,    // and its size
1548                                 NULL,           // default "replacement" char
1549                                 pUsedDef        // [out] was it used?
1550                              );
1551
1552         if ( !len )
1553         {
1554             // function totally failed
1555             return (size_t)-1;
1556         }
1557
1558         // if we were really converting, check if we succeeded
1559         if ( buf )
1560         {
1561             if ( flags )
1562             {
1563                 // check if the conversion failed, i.e. if any replacements
1564                 // were done
1565                 if ( usedDef )
1566                     return (size_t)-1;
1567             }
1568             else // we must resort to double tripping...
1569             {
1570                 wxWCharBuffer wcBuf(n);
1571                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1572                         wcscmp(wcBuf, pwz) != 0 )
1573                 {
1574                     // we didn't obtain the same thing we started from, hence
1575                     // the conversion was lossy and we consider that it failed
1576                     return (size_t)-1;
1577                 }
1578             }
1579         }
1580
1581         // see the comment above for the reason of "len - 1"
1582         return len - 1;
1583     }
1584
1585     bool IsOk() const { return m_CodePage != -1; }
1586
1587 private:
1588     static bool CanUseNoBestFit()
1589     {
1590         static int s_isWin98Or2k = -1;
1591
1592         if ( s_isWin98Or2k == -1 )
1593         {
1594             int verMaj, verMin;
1595             switch ( wxGetOsVersion(&verMaj, &verMin) )
1596             {
1597                 case wxWIN95:
1598                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1599                     break;
1600
1601                 case wxWINDOWS_NT:
1602                     s_isWin98Or2k = verMaj >= 5;
1603                     break;
1604
1605                 default:
1606                     // unknown, be conseravtive by default
1607                     s_isWin98Or2k = 0;
1608             }
1609
1610             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1611         }
1612
1613         return s_isWin98Or2k == 1;
1614     }
1615
1616     long m_CodePage;
1617 };
1618
1619 #endif // wxHAVE_WIN32_MB2WC
1620
1621 // ============================================================================
1622 // Cocoa conversion classes
1623 // ============================================================================
1624
1625 #if defined(__WXCOCOA__)
1626
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1630
1631 #include <CoreFoundation/CFString.h>
1632 #include <CoreFoundation/CFStringEncodingExt.h>
1633
1634 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1635 {
1636     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1637     if ( encoding == wxFONTENCODING_DEFAULT )
1638     {
1639         enc = CFStringGetSystemEncoding();
1640     }
1641     else switch( encoding)
1642     {
1643         case wxFONTENCODING_ISO8859_1 :
1644             enc = kCFStringEncodingISOLatin1 ;
1645             break ;
1646         case wxFONTENCODING_ISO8859_2 :
1647             enc = kCFStringEncodingISOLatin2;
1648             break ;
1649         case wxFONTENCODING_ISO8859_3 :
1650             enc = kCFStringEncodingISOLatin3 ;
1651             break ;
1652         case wxFONTENCODING_ISO8859_4 :
1653             enc = kCFStringEncodingISOLatin4;
1654             break ;
1655         case wxFONTENCODING_ISO8859_5 :
1656             enc = kCFStringEncodingISOLatinCyrillic;
1657             break ;
1658         case wxFONTENCODING_ISO8859_6 :
1659             enc = kCFStringEncodingISOLatinArabic;
1660             break ;
1661         case wxFONTENCODING_ISO8859_7 :
1662             enc = kCFStringEncodingISOLatinGreek;
1663             break ;
1664         case wxFONTENCODING_ISO8859_8 :
1665             enc = kCFStringEncodingISOLatinHebrew;
1666             break ;
1667         case wxFONTENCODING_ISO8859_9 :
1668             enc = kCFStringEncodingISOLatin5;
1669             break ;
1670         case wxFONTENCODING_ISO8859_10 :
1671             enc = kCFStringEncodingISOLatin6;
1672             break ;
1673         case wxFONTENCODING_ISO8859_11 :
1674             enc = kCFStringEncodingISOLatinThai;
1675             break ;
1676         case wxFONTENCODING_ISO8859_13 :
1677             enc = kCFStringEncodingISOLatin7;
1678             break ;
1679         case wxFONTENCODING_ISO8859_14 :
1680             enc = kCFStringEncodingISOLatin8;
1681             break ;
1682         case wxFONTENCODING_ISO8859_15 :
1683             enc = kCFStringEncodingISOLatin9;
1684             break ;
1685
1686         case wxFONTENCODING_KOI8 :
1687             enc = kCFStringEncodingKOI8_R;
1688             break ;
1689         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1690             enc = kCFStringEncodingDOSRussian;
1691             break ;
1692
1693 //      case wxFONTENCODING_BULGARIAN :
1694 //          enc = ;
1695 //          break ;
1696
1697         case wxFONTENCODING_CP437 :
1698             enc =kCFStringEncodingDOSLatinUS ;
1699             break ;
1700         case wxFONTENCODING_CP850 :
1701             enc = kCFStringEncodingDOSLatin1;
1702             break ;
1703         case wxFONTENCODING_CP852 :
1704             enc = kCFStringEncodingDOSLatin2;
1705             break ;
1706         case wxFONTENCODING_CP855 :
1707             enc = kCFStringEncodingDOSCyrillic;
1708             break ;
1709         case wxFONTENCODING_CP866 :
1710             enc =kCFStringEncodingDOSRussian ;
1711             break ;
1712         case wxFONTENCODING_CP874 :
1713             enc = kCFStringEncodingDOSThai;
1714             break ;
1715         case wxFONTENCODING_CP932 :
1716             enc = kCFStringEncodingDOSJapanese;
1717             break ;
1718         case wxFONTENCODING_CP936 :
1719             enc =kCFStringEncodingDOSChineseSimplif ;
1720             break ;
1721         case wxFONTENCODING_CP949 :
1722             enc = kCFStringEncodingDOSKorean;
1723             break ;
1724         case wxFONTENCODING_CP950 :
1725             enc = kCFStringEncodingDOSChineseTrad;
1726             break ;
1727         case wxFONTENCODING_CP1250 :
1728             enc = kCFStringEncodingWindowsLatin2;
1729             break ;
1730         case wxFONTENCODING_CP1251 :
1731             enc =kCFStringEncodingWindowsCyrillic ;
1732             break ;
1733         case wxFONTENCODING_CP1252 :
1734             enc =kCFStringEncodingWindowsLatin1 ;
1735             break ;
1736         case wxFONTENCODING_CP1253 :
1737             enc = kCFStringEncodingWindowsGreek;
1738             break ;
1739         case wxFONTENCODING_CP1254 :
1740             enc = kCFStringEncodingWindowsLatin5;
1741             break ;
1742         case wxFONTENCODING_CP1255 :
1743             enc =kCFStringEncodingWindowsHebrew ;
1744             break ;
1745         case wxFONTENCODING_CP1256 :
1746             enc =kCFStringEncodingWindowsArabic ;
1747             break ;
1748         case wxFONTENCODING_CP1257 :
1749             enc = kCFStringEncodingWindowsBalticRim;
1750             break ;
1751 //   This only really encodes to UTF7 (if that) evidently
1752 //        case wxFONTENCODING_UTF7 :
1753 //            enc = kCFStringEncodingNonLossyASCII ;
1754 //            break ;
1755         case wxFONTENCODING_UTF8 :
1756             enc = kCFStringEncodingUTF8 ;
1757             break ;
1758         case wxFONTENCODING_EUC_JP :
1759             enc = kCFStringEncodingEUC_JP;
1760             break ;
1761         case wxFONTENCODING_UTF16 :
1762             enc = kCFStringEncodingUnicode ;
1763             break ;
1764         case wxFONTENCODING_MACROMAN :
1765             enc = kCFStringEncodingMacRoman ;
1766             break ;
1767         case wxFONTENCODING_MACJAPANESE :
1768             enc = kCFStringEncodingMacJapanese ;
1769             break ;
1770         case wxFONTENCODING_MACCHINESETRAD :
1771             enc = kCFStringEncodingMacChineseTrad ;
1772             break ;
1773         case wxFONTENCODING_MACKOREAN :
1774             enc = kCFStringEncodingMacKorean ;
1775             break ;
1776         case wxFONTENCODING_MACARABIC :
1777             enc = kCFStringEncodingMacArabic ;
1778             break ;
1779         case wxFONTENCODING_MACHEBREW :
1780             enc = kCFStringEncodingMacHebrew ;
1781             break ;
1782         case wxFONTENCODING_MACGREEK :
1783             enc = kCFStringEncodingMacGreek ;
1784             break ;
1785         case wxFONTENCODING_MACCYRILLIC :
1786             enc = kCFStringEncodingMacCyrillic ;
1787             break ;
1788         case wxFONTENCODING_MACDEVANAGARI :
1789             enc = kCFStringEncodingMacDevanagari ;
1790             break ;
1791         case wxFONTENCODING_MACGURMUKHI :
1792             enc = kCFStringEncodingMacGurmukhi ;
1793             break ;
1794         case wxFONTENCODING_MACGUJARATI :
1795             enc = kCFStringEncodingMacGujarati ;
1796             break ;
1797         case wxFONTENCODING_MACORIYA :
1798             enc = kCFStringEncodingMacOriya ;
1799             break ;
1800         case wxFONTENCODING_MACBENGALI :
1801             enc = kCFStringEncodingMacBengali ;
1802             break ;
1803         case wxFONTENCODING_MACTAMIL :
1804             enc = kCFStringEncodingMacTamil ;
1805             break ;
1806         case wxFONTENCODING_MACTELUGU :
1807             enc = kCFStringEncodingMacTelugu ;
1808             break ;
1809         case wxFONTENCODING_MACKANNADA :
1810             enc = kCFStringEncodingMacKannada ;
1811             break ;
1812         case wxFONTENCODING_MACMALAJALAM :
1813             enc = kCFStringEncodingMacMalayalam ;
1814             break ;
1815         case wxFONTENCODING_MACSINHALESE :
1816             enc = kCFStringEncodingMacSinhalese ;
1817             break ;
1818         case wxFONTENCODING_MACBURMESE :
1819             enc = kCFStringEncodingMacBurmese ;
1820             break ;
1821         case wxFONTENCODING_MACKHMER :
1822             enc = kCFStringEncodingMacKhmer ;
1823             break ;
1824         case wxFONTENCODING_MACTHAI :
1825             enc = kCFStringEncodingMacThai ;
1826             break ;
1827         case wxFONTENCODING_MACLAOTIAN :
1828             enc = kCFStringEncodingMacLaotian ;
1829             break ;
1830         case wxFONTENCODING_MACGEORGIAN :
1831             enc = kCFStringEncodingMacGeorgian ;
1832             break ;
1833         case wxFONTENCODING_MACARMENIAN :
1834             enc = kCFStringEncodingMacArmenian ;
1835             break ;
1836         case wxFONTENCODING_MACCHINESESIMP :
1837             enc = kCFStringEncodingMacChineseSimp ;
1838             break ;
1839         case wxFONTENCODING_MACTIBETAN :
1840             enc = kCFStringEncodingMacTibetan ;
1841             break ;
1842         case wxFONTENCODING_MACMONGOLIAN :
1843             enc = kCFStringEncodingMacMongolian ;
1844             break ;
1845         case wxFONTENCODING_MACETHIOPIC :
1846             enc = kCFStringEncodingMacEthiopic ;
1847             break ;
1848         case wxFONTENCODING_MACCENTRALEUR :
1849             enc = kCFStringEncodingMacCentralEurRoman ;
1850             break ;
1851         case wxFONTENCODING_MACVIATNAMESE :
1852             enc = kCFStringEncodingMacVietnamese ;
1853             break ;
1854         case wxFONTENCODING_MACARABICEXT :
1855             enc = kCFStringEncodingMacExtArabic ;
1856             break ;
1857         case wxFONTENCODING_MACSYMBOL :
1858             enc = kCFStringEncodingMacSymbol ;
1859             break ;
1860         case wxFONTENCODING_MACDINGBATS :
1861             enc = kCFStringEncodingMacDingbats ;
1862             break ;
1863         case wxFONTENCODING_MACTURKISH :
1864             enc = kCFStringEncodingMacTurkish ;
1865             break ;
1866         case wxFONTENCODING_MACCROATIAN :
1867             enc = kCFStringEncodingMacCroatian ;
1868             break ;
1869         case wxFONTENCODING_MACICELANDIC :
1870             enc = kCFStringEncodingMacIcelandic ;
1871             break ;
1872         case wxFONTENCODING_MACROMANIAN :
1873             enc = kCFStringEncodingMacRomanian ;
1874             break ;
1875         case wxFONTENCODING_MACCELTIC :
1876             enc = kCFStringEncodingMacCeltic ;
1877             break ;
1878         case wxFONTENCODING_MACGAELIC :
1879             enc = kCFStringEncodingMacGaelic ;
1880             break ;
1881 //      case wxFONTENCODING_MACKEYBOARD :
1882 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
1883 //          break ;
1884         default :
1885             // because gcc is picky
1886             break ;
1887     } ;
1888     return enc ;
1889 }
1890
1891 class wxMBConv_cocoa : public wxMBConv
1892 {
1893 public:
1894     wxMBConv_cocoa()
1895     {
1896         Init(CFStringGetSystemEncoding()) ;
1897     }
1898
1899 #if wxUSE_FONTMAP
1900     wxMBConv_cocoa(const wxChar* name)
1901     {
1902         Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1903     }
1904 #endif
1905
1906     wxMBConv_cocoa(wxFontEncoding encoding)
1907     {
1908         Init( wxCFStringEncFromFontEnc(encoding) );
1909     }
1910
1911     ~wxMBConv_cocoa()
1912     {
1913     }
1914
1915     void Init( CFStringEncoding encoding)
1916     {
1917         m_encoding = encoding ;
1918     }
1919
1920     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1921     {
1922         wxASSERT(szUnConv);
1923
1924         CFStringRef theString = CFStringCreateWithBytes (
1925                                                 NULL, //the allocator
1926                                                 (const UInt8*)szUnConv,
1927                                                 strlen(szUnConv),
1928                                                 m_encoding,
1929                                                 false //no BOM/external representation
1930                                                 );
1931
1932         wxASSERT(theString);
1933
1934         size_t nOutLength = CFStringGetLength(theString);
1935
1936         if (szOut == NULL)
1937         {
1938             CFRelease(theString);
1939             return nOutLength;
1940         }
1941
1942         CFRange theRange = { 0, nOutSize };
1943
1944 #if SIZEOF_WCHAR_T == 4
1945         UniChar* szUniCharBuffer = new UniChar[nOutSize];
1946 #endif
1947
1948         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1949
1950         CFRelease(theString);
1951
1952         szUniCharBuffer[nOutLength] = '\0' ;
1953
1954 #if SIZEOF_WCHAR_T == 4
1955         wxMBConvUTF16 converter ;
1956         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1957         delete[] szUniCharBuffer;
1958 #endif
1959
1960         return nOutLength;
1961     }
1962
1963     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1964     {
1965         wxASSERT(szUnConv);
1966
1967         size_t nRealOutSize;
1968         size_t nBufSize = wxWcslen(szUnConv);
1969         UniChar* szUniBuffer = (UniChar*) szUnConv;
1970
1971 #if SIZEOF_WCHAR_T == 4
1972         wxMBConvUTF16BE converter ;
1973         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1974         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1975         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1976         nBufSize /= sizeof(UniChar);
1977 #endif
1978
1979         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1980                                 NULL, //allocator
1981                                 szUniBuffer,
1982                                 nBufSize,
1983                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1984                             );
1985
1986         wxASSERT(theString);
1987
1988         //Note that CER puts a BOM when converting to unicode
1989         //so we  check and use getchars instead in that case
1990         if (m_encoding == kCFStringEncodingUnicode)
1991         {
1992             if (szOut != NULL)
1993                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1994
1995             nRealOutSize = CFStringGetLength(theString) + 1;
1996         }
1997         else
1998         {
1999             CFStringGetBytes(
2000                 theString,
2001                 CFRangeMake(0, CFStringGetLength(theString)),
2002                 m_encoding,
2003                 0, //what to put in characters that can't be converted -
2004                     //0 tells CFString to return NULL if it meets such a character
2005                 false, //not an external representation
2006                 (UInt8*) szOut,
2007                 nOutSize,
2008                 (CFIndex*) &nRealOutSize
2009                         );
2010         }
2011
2012         CFRelease(theString);
2013
2014 #if SIZEOF_WCHAR_T == 4
2015         delete[] szUniBuffer;
2016 #endif
2017
2018         return  nRealOutSize - 1;
2019     }
2020
2021     bool IsOk() const
2022     {
2023         return m_encoding != kCFStringEncodingInvalidId &&
2024               CFStringIsEncodingAvailable(m_encoding);
2025     }
2026
2027 private:
2028     CFStringEncoding m_encoding ;
2029 };
2030
2031 #endif // defined(__WXCOCOA__)
2032
2033 // ============================================================================
2034 // Mac conversion classes
2035 // ============================================================================
2036
2037 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2038
2039 class wxMBConv_mac : public wxMBConv
2040 {
2041 public:
2042     wxMBConv_mac()
2043     {
2044         Init(CFStringGetSystemEncoding()) ;
2045     }
2046
2047 #if wxUSE_FONTMAP
2048     wxMBConv_mac(const wxChar* name)
2049     {
2050         Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2051     }
2052 #endif
2053
2054     wxMBConv_mac(wxFontEncoding encoding)
2055     {
2056         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2057     }
2058
2059     ~wxMBConv_mac()
2060     {
2061         OSStatus status = noErr ;
2062         status = TECDisposeConverter(m_MB2WC_converter);
2063         status = TECDisposeConverter(m_WC2MB_converter);
2064     }
2065
2066
2067     void Init( TextEncodingBase encoding)
2068     {
2069         OSStatus status = noErr ;
2070         m_char_encoding = encoding ;
2071         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2072
2073         status = TECCreateConverter(&m_MB2WC_converter,
2074                                     m_char_encoding,
2075                                     m_unicode_encoding);
2076         status = TECCreateConverter(&m_WC2MB_converter,
2077                                     m_unicode_encoding,
2078                                     m_char_encoding);
2079     }
2080
2081     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2082     {
2083         OSStatus status = noErr ;
2084         ByteCount byteOutLen ;
2085         ByteCount byteInLen = strlen(psz) ;
2086         wchar_t *tbuf = NULL ;
2087         UniChar* ubuf = NULL ;
2088         size_t res = 0 ;
2089
2090         if (buf == NULL)
2091         {
2092             //apple specs say at least 32
2093             n = wxMax( 32 , byteInLen ) ;
2094             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2095         }
2096         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2097 #if SIZEOF_WCHAR_T == 4
2098         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2099 #else
2100         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2101 #endif
2102         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2103           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2104 #if SIZEOF_WCHAR_T == 4
2105         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2106         // is not properly terminated we get random characters at the end
2107         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2108         wxMBConvUTF16BE converter ;
2109         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2110         free( ubuf ) ;
2111 #else
2112         res = byteOutLen / sizeof( UniChar ) ;
2113 #endif
2114         if ( buf == NULL )
2115              free(tbuf) ;
2116
2117         if ( buf  && res < n)
2118             buf[res] = 0;
2119
2120         return res ;
2121     }
2122
2123     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2124     {
2125         OSStatus status = noErr ;
2126         ByteCount byteOutLen ;
2127         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2128
2129         char *tbuf = NULL ;
2130
2131         if (buf == NULL)
2132         {
2133             //apple specs say at least 32
2134             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2135             tbuf = (char*) malloc( n ) ;
2136         }
2137
2138         ByteCount byteBufferLen = n ;
2139         UniChar* ubuf = NULL ;
2140 #if SIZEOF_WCHAR_T == 4
2141         wxMBConvUTF16BE converter ;
2142         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2143         byteInLen = unicharlen ;
2144         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2145         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2146 #else
2147         ubuf = (UniChar*) psz ;
2148 #endif
2149         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2150             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2151 #if SIZEOF_WCHAR_T == 4
2152         free( ubuf ) ;
2153 #endif
2154         if ( buf == NULL )
2155             free(tbuf) ;
2156
2157         size_t res = byteOutLen ;
2158         if ( buf  && res < n)
2159         {
2160             buf[res] = 0;
2161
2162             //we need to double-trip to verify it didn't insert any ? in place
2163             //of bogus characters
2164             wxWCharBuffer wcBuf(n);
2165             size_t pszlen = wxWcslen(psz);
2166             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2167                         wxWcslen(wcBuf) != pszlen ||
2168                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2169             {
2170                 // we didn't obtain the same thing we started from, hence
2171                 // the conversion was lossy and we consider that it failed
2172                 return (size_t)-1;
2173             }
2174         }
2175
2176         return res ;
2177     }
2178
2179     bool IsOk() const
2180         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2181
2182 private:
2183     TECObjectRef m_MB2WC_converter ;
2184     TECObjectRef m_WC2MB_converter ;
2185
2186     TextEncodingBase m_char_encoding ;
2187     TextEncodingBase m_unicode_encoding ;
2188 };
2189
2190 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2191
2192 // ============================================================================
2193 // wxEncodingConverter based conversion classes
2194 // ============================================================================
2195
2196 #if wxUSE_FONTMAP
2197
2198 class wxMBConv_wxwin : public wxMBConv
2199 {
2200 private:
2201     void Init()
2202     {
2203         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2204                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2205     }
2206
2207 public:
2208     // temporarily just use wxEncodingConverter stuff,
2209     // so that it works while a better implementation is built
2210     wxMBConv_wxwin(const wxChar* name)
2211     {
2212         if (name)
2213             m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2214         else
2215             m_enc = wxFONTENCODING_SYSTEM;
2216
2217         Init();
2218     }
2219
2220     wxMBConv_wxwin(wxFontEncoding enc)
2221     {
2222         m_enc = enc;
2223
2224         Init();
2225     }
2226
2227     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2228     {
2229         size_t inbuf = strlen(psz);
2230         if (buf)
2231         {
2232             if (!m2w.Convert(psz,buf))
2233                 return (size_t)-1;
2234         }
2235         return inbuf;
2236     }
2237
2238     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2239     {
2240         const size_t inbuf = wxWcslen(psz);
2241         if (buf)
2242         {
2243             if (!w2m.Convert(psz,buf))
2244                 return (size_t)-1;
2245         }
2246
2247         return inbuf;
2248     }
2249
2250     bool IsOk() const { return m_ok; }
2251
2252 public:
2253     wxFontEncoding m_enc;
2254     wxEncodingConverter m2w, w2m;
2255
2256     // were we initialized successfully?
2257     bool m_ok;
2258
2259     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2260 };
2261
2262 #endif // wxUSE_FONTMAP
2263
2264 // ============================================================================
2265 // wxCSConv implementation
2266 // ============================================================================
2267
2268 void wxCSConv::Init()
2269 {
2270     m_name = NULL;
2271     m_convReal =  NULL;
2272     m_deferred = true;
2273 }
2274
2275 wxCSConv::wxCSConv(const wxChar *charset)
2276 {
2277     Init();
2278
2279     if ( charset )
2280     {
2281         SetName(charset);
2282     }
2283
2284     m_encoding = wxFONTENCODING_SYSTEM;
2285 }
2286
2287 wxCSConv::wxCSConv(wxFontEncoding encoding)
2288 {
2289     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2290     {
2291         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2292
2293         encoding = wxFONTENCODING_SYSTEM;
2294     }
2295
2296     Init();
2297
2298     m_encoding = encoding;
2299 }
2300
2301 wxCSConv::~wxCSConv()
2302 {
2303     Clear();
2304 }
2305
2306 wxCSConv::wxCSConv(const wxCSConv& conv)
2307         : wxMBConv()
2308 {
2309     Init();
2310
2311     SetName(conv.m_name);
2312     m_encoding = conv.m_encoding;
2313 }
2314
2315 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2316 {
2317     Clear();
2318
2319     SetName(conv.m_name);
2320     m_encoding = conv.m_encoding;
2321
2322     return *this;
2323 }
2324
2325 void wxCSConv::Clear()
2326 {
2327     free(m_name);
2328     delete m_convReal;
2329
2330     m_name = NULL;
2331     m_convReal = NULL;
2332 }
2333
2334 void wxCSConv::SetName(const wxChar *charset)
2335 {
2336     if (charset)
2337     {
2338         m_name = wxStrdup(charset);
2339         m_deferred = true;
2340     }
2341 }
2342
2343 wxMBConv *wxCSConv::DoCreate() const
2344 {
2345     // check for the special case of ASCII or ISO8859-1 charset: as we have
2346     // special knowledge of it anyhow, we don't need to create a special
2347     // conversion object
2348     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2349     {
2350         // don't convert at all
2351         return NULL;
2352     }
2353
2354     // we trust OS to do conversion better than we can so try external
2355     // conversion methods first
2356     //
2357     // the full order is:
2358     //      1. OS conversion (iconv() under Unix or Win32 API)
2359     //      2. hard coded conversions for UTF
2360     //      3. wxEncodingConverter as fall back
2361
2362     // step (1)
2363 #ifdef HAVE_ICONV
2364 #if !wxUSE_FONTMAP
2365     if ( m_name )
2366 #endif // !wxUSE_FONTMAP
2367     {
2368         wxString name(m_name);
2369
2370 #if wxUSE_FONTMAP
2371         if ( name.empty() )
2372             name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2373 #endif // wxUSE_FONTMAP
2374
2375         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2376         if ( conv->IsOk() )
2377             return conv;
2378
2379         delete conv;
2380     }
2381 #endif // HAVE_ICONV
2382
2383 #ifdef wxHAVE_WIN32_MB2WC
2384     {
2385 #if wxUSE_FONTMAP
2386         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2387                                       : new wxMBConv_win32(m_encoding);
2388         if ( conv->IsOk() )
2389             return conv;
2390
2391         delete conv;
2392 #else
2393         return NULL;
2394 #endif
2395     }
2396 #endif // wxHAVE_WIN32_MB2WC
2397 #if defined(__WXMAC__)
2398     {
2399         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2400         {
2401
2402 #if wxUSE_FONTMAP
2403             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2404                                         : new wxMBConv_mac(m_encoding);
2405 #else
2406             wxMBConv_mac *conv = new wxMBConv_mac(m_encoding);
2407 #endif
2408             if ( conv->IsOk() )
2409                  return conv;
2410
2411             delete conv;
2412         }
2413     }
2414 #endif
2415 #if defined(__WXCOCOA__)
2416     {
2417         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2418         {
2419
2420 #if wxUSE_FONTMAP
2421             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2422                                           : new wxMBConv_cocoa(m_encoding);
2423 #else
2424             wxMBConv_cocoa *conv = new wxMBConv_cocoa(m_encoding);
2425 #endif
2426             if ( conv->IsOk() )
2427                  return conv;
2428
2429             delete conv;
2430         }
2431     }
2432 #endif
2433     // step (2)
2434     wxFontEncoding enc = m_encoding;
2435 #if wxUSE_FONTMAP
2436     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2437     {
2438         // use "false" to suppress interactive dialogs -- we can be called from
2439         // anywhere and popping up a dialog from here is the last thing we want to
2440         // do
2441         enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2442     }
2443 #endif // wxUSE_FONTMAP
2444
2445     switch ( enc )
2446     {
2447         case wxFONTENCODING_UTF7:
2448              return new wxMBConvUTF7;
2449
2450         case wxFONTENCODING_UTF8:
2451              return new wxMBConvUTF8;
2452
2453         case wxFONTENCODING_UTF16BE:
2454              return new wxMBConvUTF16BE;
2455
2456         case wxFONTENCODING_UTF16LE:
2457              return new wxMBConvUTF16LE;
2458
2459         case wxFONTENCODING_UTF32BE:
2460              return new wxMBConvUTF32BE;
2461
2462         case wxFONTENCODING_UTF32LE:
2463              return new wxMBConvUTF32LE;
2464
2465         default:
2466              // nothing to do but put here to suppress gcc warnings
2467              ;
2468     }
2469
2470     // step (3)
2471 #if wxUSE_FONTMAP
2472     {
2473         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2474                                       : new wxMBConv_wxwin(m_encoding);
2475         if ( conv->IsOk() )
2476             return conv;
2477
2478         delete conv;
2479     }
2480 #endif // wxUSE_FONTMAP
2481
2482     // NB: This is a hack to prevent deadlock. What could otherwise happen
2483     //     in Unicode build: wxConvLocal creation ends up being here
2484     //     because of some failure and logs the error. But wxLog will try to
2485     //     attach timestamp, for which it will need wxConvLocal (to convert
2486     //     time to char* and then wchar_t*), but that fails, tries to log
2487     //     error, but wxLog has a (already locked) critical section that
2488     //     guards static buffer.
2489     static bool alreadyLoggingError = false;
2490     if (!alreadyLoggingError)
2491     {
2492         alreadyLoggingError = true;
2493         wxLogError(_("Cannot convert from the charset '%s'!"),
2494                    m_name ? m_name
2495                       :
2496 #if wxUSE_FONTMAP
2497                          wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2498 #else // !wxUSE_FONTMAP
2499                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2500 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2501               );
2502         alreadyLoggingError = false;
2503     }
2504
2505     return NULL;
2506 }
2507
2508 void wxCSConv::CreateConvIfNeeded() const
2509 {
2510     if ( m_deferred )
2511     {
2512         wxCSConv *self = (wxCSConv *)this; // const_cast
2513
2514 #if wxUSE_INTL
2515         // if we don't have neither the name nor the encoding, use the default
2516         // encoding for this system
2517         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2518         {
2519             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2520         }
2521 #endif // wxUSE_INTL
2522
2523         self->m_convReal = DoCreate();
2524         self->m_deferred = false;
2525     }
2526 }
2527
2528 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2529 {
2530     CreateConvIfNeeded();
2531
2532     if (m_convReal)
2533         return m_convReal->MB2WC(buf, psz, n);
2534
2535     // latin-1 (direct)
2536     size_t len = strlen(psz);
2537
2538     if (buf)
2539     {
2540         for (size_t c = 0; c <= len; c++)
2541             buf[c] = (unsigned char)(psz[c]);
2542     }
2543
2544     return len;
2545 }
2546
2547 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2548 {
2549     CreateConvIfNeeded();
2550
2551     if (m_convReal)
2552         return m_convReal->WC2MB(buf, psz, n);
2553
2554     // latin-1 (direct)
2555     const size_t len = wxWcslen(psz);
2556     if (buf)
2557     {
2558         for (size_t c = 0; c <= len; c++)
2559         {
2560             if (psz[c] > 0xFF)
2561                 return (size_t)-1;
2562             buf[c] = (char)psz[c];
2563         }
2564     }
2565     else
2566     {
2567         for (size_t c = 0; c <= len; c++)
2568         {
2569             if (psz[c] > 0xFF)
2570                 return (size_t)-1;
2571         }
2572     }
2573
2574     return len;
2575 }
2576
2577 // ----------------------------------------------------------------------------
2578 // globals
2579 // ----------------------------------------------------------------------------
2580
2581 #ifdef __WINDOWS__
2582     static wxMBConv_win32 wxConvLibcObj;
2583 #elif defined(__WXMAC__) && !defined(__MACH__)
2584     static wxMBConv_mac wxConvLibcObj ;
2585 #else
2586     static wxMBConvLibc wxConvLibcObj;
2587 #endif
2588
2589 static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
2590 static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
2591 static wxMBConvUTF7 wxConvUTF7Obj;
2592 static wxMBConvUTF8 wxConvUTF8Obj;
2593
2594
2595 WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
2596 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
2597 WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
2598 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
2599 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
2600 WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2601
2602 #else // !wxUSE_WCHAR_T
2603
2604 // stand-ins in absence of wchar_t
2605 WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
2606                                 wxConvISO8859_1,
2607                                 wxConvLocal,
2608                                 wxConvUTF8;
2609
2610 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2611
2612