src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
  24   #pragma implementation "strconv.h"
  25 #endif
  26
  27 // For compilers that support precompilation, includes "wx.h".
  28 #include "wx/wxprec.h"
  29
  30 #ifdef __BORLANDC__
  31   #pragma hdrstop
  32 #endif
  33
  34 #ifndef WX_PRECOMP
  35     #include "wx/intl.h"
  36     #include "wx/log.h"
  37 #endif // WX_PRECOMP
  38
  39 #include "wx/strconv.h"
  40
  41 #if wxUSE_WCHAR_T
  42
  43 #ifdef __WXMSW__
  44     #include "wx/msw/private.h"
  45 #endif
  46
  47 #ifdef __WINDOWS__
  48     #include "wx/msw/missing.h"
  49 #endif
  50
  51 #ifndef __WXWINCE__
  52 #include <errno.h>
  53 #endif
  54
  55 #include <ctype.h>
  56 #include <string.h>
  57 #include <stdlib.h>
  58
  59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  60     #define wxHAVE_WIN32_MB2WC
  61 #endif // __WIN32__ but !__WXMICROWIN__
  62
  63 // ----------------------------------------------------------------------------
  64 // headers
  65 // ----------------------------------------------------------------------------
  66
  67 #ifdef __SALFORDC__
  68     #include <clib.h>
  69 #endif
  70
  71 #ifdef HAVE_ICONV
  72     #include <iconv.h>
  73     #include "wx/thread.h"
  74 #endif
  75
  76 #include "wx/encconv.h"
  77 #include "wx/fontmap.h"
  78 #include "wx/utils.h"
  79
  80 #ifdef __WXMAC__
  81 #include <ATSUnicode.h>
  82 #include <TextCommon.h>
  83 #include <TextEncodingConverter.h>
  84
  85 #include  "wx/mac/private.h"  // includes mac headers
  86 #endif
  87 // ----------------------------------------------------------------------------
  88 // macros
  89 // ----------------------------------------------------------------------------
  90
  91 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
  92 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
  93
// Select the encoding names and the byte-swap macro matching the size of
// wchar_t. WC_UTF16 is defined only for a 2 byte wchar_t; the converters
// below test it to choose between their 16-bit and 32-bit code paths.
// WC_NAME_BEST additionally spells out the byte order (from WORDS_BIGENDIAN).
#if SIZEOF_WCHAR_T == 4
    #define WC_NAME         "UCS4"
    #define WC_BSWAP         BSWAP_UCS4
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UCS-4BE"
    #else
      #define WC_NAME_BEST  "UCS-4LE"
    #endif
#elif SIZEOF_WCHAR_T == 2
    #define WC_NAME         "UTF16"
    #define WC_BSWAP         BSWAP_UTF16
    #define WC_UTF16
    #ifdef WORDS_BIGENDIAN
      #define WC_NAME_BEST  "UTF-16BE"
    #else
      #define WC_NAME_BEST  "UTF-16LE"
    #endif
#else // sizeof(wchar_t) != 2 nor 4
    // no code path in this file handles any other wchar_t size, so fail the
    // build rather than miscompile
    #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
#endif
 115
 116 // ============================================================================
 117 // implementation
 118 // ============================================================================
 119
 120 // ----------------------------------------------------------------------------
 121 // UTF-16 en/decoding to/from UCS-4
 122 // ----------------------------------------------------------------------------
 123
 124
 125 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 126 {
 127     if (input<=0xffff)
 128     {
 129         if (output)
 130             *output = (wxUint16) input;
 131         return 1;
 132     }
 133     else if (input>=0x110000)
 134     {
 135         return (size_t)-1;
 136     }
 137     else
 138     {
 139         if (output)
 140         {
 141             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 142             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 143         }
 144         return 2;
 145     }
 146 }
 147
 148 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 149 {
 150     if ((*input<0xd800) || (*input>0xdfff))
 151     {
 152         output = *input;
 153         return 1;
 154     }
 155     else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
 156     {
 157         output = *input;
 158         return (size_t)-1;
 159     }
 160     else
 161     {
 162         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 163         return 2;
 164     }
 165 }
 166
 167
 168 // ----------------------------------------------------------------------------
 169 // wxMBConv
 170 // ----------------------------------------------------------------------------
 171
// Out-of-line destructor with an intentionally empty body.
wxMBConv::~wxMBConv()
{
    // nothing to do here (necessary for Darwin linking probably)
}
 176
 177 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 178 {
 179     if ( psz )
 180     {
 181         // calculate the length of the buffer needed first
 182         size_t nLen = MB2WC(NULL, psz, 0);
 183         if ( nLen != (size_t)-1 )
 184         {
 185             // now do the actual conversion
 186             wxWCharBuffer buf(nLen);
 187             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 188             if ( nLen != (size_t)-1 )
 189             {
 190                 return buf;
 191             }
 192         }
 193     }
 194
 195     wxWCharBuffer buf((wchar_t *)NULL);
 196
 197     return buf;
 198 }
 199
 200 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 201 {
 202     if ( pwz )
 203     {
 204         size_t nLen = WC2MB(NULL, pwz, 0);
 205         if ( nLen != (size_t)-1 )
 206         {
 207             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 208             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 209             if ( nLen != (size_t)-1 )
 210             {
 211                 return buf;
 212             }
 213         }
 214     }
 215
 216     wxCharBuffer buf((char *)NULL);
 217
 218     return buf;
 219 }
 220
 221 const wxWCharBuffer wxMBConv::cMB2WC(const char *szString, size_t nStringLen, size_t* pOutSize) const
 222 {
 223     wxASSERT(pOutSize != NULL);
 224
 225     const char* szEnd = szString + nStringLen + 1;
 226     const char* szPos = szString;
 227     const char* szStart = szPos;
 228
 229     size_t nActualLength = 0;
 230     size_t nCurrentSize = nStringLen; //try normal size first (should never resize?)
 231
 232     wxWCharBuffer theBuffer(nCurrentSize);
 233
 234     //Convert the string until the length() is reached, continuing the
 235     //loop every time a null character is reached
 236     while(szPos != szEnd)
 237     {
 238         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 239
 240         //Get the length of the current (sub)string
 241         size_t nLen = MB2WC(NULL, szPos, 0);
 242
 243         //Invalid conversion?
 244         if( nLen == (size_t)-1 )
 245         {
 246             *pOutSize = 0;
 247             theBuffer.data()[0u] = wxT('\0');
 248             return theBuffer;
 249         }
 250
 251
 252         //Increase the actual length (+1 for current null character)
 253         nActualLength += nLen + 1;
 254
 255         //if buffer too big, realloc the buffer
 256         if (nActualLength > (nCurrentSize+1))
 257         {
 258             wxWCharBuffer theNewBuffer(nCurrentSize << 1);
 259             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize * sizeof(wchar_t));
 260             theBuffer = theNewBuffer;
 261             nCurrentSize <<= 1;
 262         }
 263
 264         //Convert the current (sub)string
 265         if ( MB2WC(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 266         {
 267             *pOutSize = 0;
 268             theBuffer.data()[0u] = wxT('\0');
 269             return theBuffer;
 270         }
 271
 272         //Increment to next (sub)string
 273         //Note that we have to use strlen here instead of nLen
 274         //here because XX2XX gives us the size of the output buffer,
 275         //not neccessarly the length of the string
 276         szPos += strlen(szPos) + 1;
 277     }
 278
 279     //success - return actual length and the buffer
 280     *pOutSize = nActualLength;
 281     return theBuffer;
 282 }
 283
 284 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *szString, size_t nStringLen, size_t* pOutSize) const
 285 {
 286     wxASSERT(pOutSize != NULL);
 287
 288     const wchar_t* szEnd = szString + nStringLen + 1;
 289     const wchar_t* szPos = szString;
 290     const wchar_t* szStart = szPos;
 291
 292     size_t nActualLength = 0;
 293     size_t nCurrentSize = nStringLen << 2; //try * 4 first
 294
 295     wxCharBuffer theBuffer(nCurrentSize);
 296
 297     //Convert the string until the length() is reached, continuing the
 298     //loop every time a null character is reached
 299     while(szPos != szEnd)
 300     {
 301         wxASSERT(szPos < szEnd); //something is _really_ screwed up if this rings true
 302
 303         //Get the length of the current (sub)string
 304         size_t nLen = WC2MB(NULL, szPos, 0);
 305
 306         //Invalid conversion?
 307         if( nLen == (size_t)-1 )
 308         {
 309             *pOutSize = 0;
 310             theBuffer.data()[0u] = wxT('\0');
 311             return theBuffer;
 312         }
 313
 314         //Increase the actual length (+1 for current null character)
 315         nActualLength += nLen + 1;
 316
 317         //if buffer too big, realloc the buffer
 318         if (nActualLength > (nCurrentSize+1))
 319         {
 320             wxCharBuffer theNewBuffer(nCurrentSize << 1);
 321             memcpy(theNewBuffer.data(), theBuffer.data(), nCurrentSize);
 322             theBuffer = theNewBuffer;
 323             nCurrentSize <<= 1;
 324         }
 325
 326         //Convert the current (sub)string
 327         if(WC2MB(&theBuffer.data()[szPos - szStart], szPos, nLen + 1) == (size_t)-1 )
 328         {
 329             *pOutSize = 0;
 330             theBuffer.data()[0u] = wxT('\0');
 331             return theBuffer;
 332         }
 333
 334         //Increment to next (sub)string
 335         //Note that we have to use wxWcslen here instead of nLen
 336         //here because XX2XX gives us the size of the output buffer,
 337         //not neccessarly the length of the string
 338         szPos += wxWcslen(szPos) + 1;
 339     }
 340
 341     //success - return actual length and the buffer
 342     *pOutSize = nActualLength;
 343     return theBuffer;
 344 }
 345
 346 // ----------------------------------------------------------------------------
 347 // wxMBConvLibc
 348 // ----------------------------------------------------------------------------
 349
// Multibyte to wide char conversion: forwards its arguments unchanged to
// wxMB2WC() and returns that function's result.
size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
{
    return wxMB2WC(buf, psz, n);
}
 354
// Wide char to multibyte conversion: forwards its arguments unchanged to
// wxWC2MB() and returns that function's result.
size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
{
    return wxWC2MB(buf, psz, n);
}
 359 // ----------------------------------------------------------------------------
 360 // UTF-7
 361 // ----------------------------------------------------------------------------
 362
 363 // Implementation (C) 2004 Fredrik Roubert
 364
//
// BASE64 decoding table
//
// Indexed by byte value (256 entries). An entry is the 6 bit value of the
// corresponding BASE64 character ('A'-'Z', 'a'-'z', '0'-'9', '+', '/') or
// 0xff for any byte which is not a BASE64 character. The entry for NUL is
// 0xff too, which is what stops the decoding loop at the end of the string.
//
static const unsigned char utf7unb64[] =
{
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
    0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
    0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
    0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
    0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
    0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
    0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
    0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
    0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
 403
 404 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 405 {
 406     size_t len = 0;
 407
 408     while (*psz && ((!buf) || (len < n)))
 409     {
 410         unsigned char cc = *psz++;
 411         if (cc != '+')
 412         {
 413             // plain ASCII char
 414             if (buf)
 415                 *buf++ = cc;
 416             len++;
 417         }
 418         else if (*psz == '-')
 419         {
 420             // encoded plus sign
 421             if (buf)
 422                 *buf++ = cc;
 423             len++;
 424             psz++;
 425         }
 426         else
 427         {
 428             // BASE64 encoded string
 429             bool lsb;
 430             unsigned char c;
 431             unsigned int d, l;
 432             for (lsb = false, d = 0, l = 0;
 433                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
 434             {
 435                 d <<= 6;
 436                 d += cc;
 437                 for (l += 6; l >= 8; lsb = !lsb)
 438                 {
 439                     c = (unsigned char)((d >> (l -= 8)) % 256);
 440                     if (lsb)
 441                     {
 442                         if (buf)
 443                             *buf++ |= c;
 444                         len ++;
 445                     }
 446                     else
 447                         if (buf)
 448                             *buf = (wchar_t)(c << 8);
 449                 }
 450             }
 451             if (*psz == '-')
 452                 psz++;
 453         }
 454     }
 455     if (buf && (len < n))
 456         *buf = 0;
 457     return len;
 458 }
 459
//
// BASE64 encoding table
//
// Indexed by a 6 bit value (0..63), gives the BASE64 character for it.
//
static const unsigned char utf7enb64[] =
{
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
    'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
    'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
    'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
    'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
    'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
    'w', 'x', 'y', 'z', '0', '1', '2', '3',
    '4', '5', '6', '7', '8', '9', '+', '/'
};
 474
//
// UTF-7 encoding table
//
// Indexed by ASCII code (0..127), classifies each character:
//
// 0 - Set D (directly encoded characters)
// 1 - Set O (optional direct characters)
// 2 - whitespace characters (optional)
// 3 - special characters
//
// The encoder below only emits class 0 characters directly, everything else
// is BASE64 encoded.
//
static const unsigned char utf7encode[128] =
{
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
};
 494
 495 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 496 {
 497
 498
 499     size_t len = 0;
 500
 501     while (*psz && ((!buf) || (len < n)))
 502     {
 503         wchar_t cc = *psz++;
 504         if (cc < 0x80 && utf7encode[cc] < 1)
 505         {
 506             // plain ASCII char
 507             if (buf)
 508                 *buf++ = (char)cc;
 509             len++;
 510         }
 511 #ifndef WC_UTF16
 512         else if (((wxUint32)cc) > 0xffff)
 513             {
 514             // no surrogate pair generation (yet?)
 515             return (size_t)-1;
 516         }
 517 #endif
 518         else
 519         {
 520             if (buf)
 521                 *buf++ = '+';
 522             len++;
 523             if (cc != '+')
 524             {
 525                 // BASE64 encode string
 526                 unsigned int lsb, d, l;
 527                 for (d = 0, l = 0;; psz++)
 528                 {
 529                     for (lsb = 0; lsb < 2; lsb ++)
 530                     {
 531                         d <<= 8;
 532                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 533
 534                         for (l += 8; l >= 6; )
 535                         {
 536                             l -= 6;
 537                             if (buf)
 538                                 *buf++ = utf7enb64[(d >> l) % 64];
 539                             len++;
 540                         }
 541                     }
 542                     cc = *psz;
 543                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 544                         break;
 545                 }
 546                 if (l != 0)
 547                 {
 548                     if (buf)
 549                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 550                     len++;
 551                 }
 552             }
 553             if (buf)
 554                 *buf++ = '-';
 555             len++;
 556         }
 557     }
 558     if (buf && (len < n))
 559         *buf = 0;
 560     return len;
 561 }
 562
 563 // ----------------------------------------------------------------------------
 564 // UTF-8
 565 // ----------------------------------------------------------------------------
 566
 567 static wxUint32 utf8_max[]=
 568     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 569
 570 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 571 {
 572     size_t len = 0;
 573
 574     while (*psz && ((!buf) || (len < n)))
 575     {
 576         unsigned char cc = *psz++, fc = cc;
 577         unsigned cnt;
 578         for (cnt = 0; fc & 0x80; cnt++)
 579             fc <<= 1;
 580         if (!cnt)
 581         {
 582             // plain ASCII char
 583             if (buf)
 584                 *buf++ = cc;
 585             len++;
 586         }
 587         else
 588         {
 589             cnt--;
 590             if (!cnt)
 591             {
 592                 // invalid UTF-8 sequence
 593                 return (size_t)-1;
 594             }
 595             else
 596             {
 597                 unsigned ocnt = cnt - 1;
 598                 wxUint32 res = cc & (0x3f >> cnt);
 599                 while (cnt--)
 600                 {
 601                     cc = *psz++;
 602                     if ((cc & 0xC0) != 0x80)
 603                     {
 604                         // invalid UTF-8 sequence
 605                         return (size_t)-1;
 606                     }
 607                     res = (res << 6) | (cc & 0x3f);
 608                 }
 609                 if (res <= utf8_max[ocnt])
 610                 {
 611                     // illegal UTF-8 encoding
 612                     return (size_t)-1;
 613                 }
 614 #ifdef WC_UTF16
 615                 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 616                 size_t pa = encode_utf16(res, (wxUint16 *)buf);
 617                 if (pa == (size_t)-1)
 618                   return (size_t)-1;
 619                 if (buf)
 620                     buf += pa;
 621                 len += pa;
 622 #else // !WC_UTF16
 623                 if (buf)
 624                     *buf++ = res;
 625                 len++;
 626 #endif // WC_UTF16/!WC_UTF16
 627             }
 628         }
 629     }
 630     if (buf && (len < n))
 631         *buf = 0;
 632     return len;
 633 }
 634
 635 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 636 {
 637     size_t len = 0;
 638
 639     while (*psz && ((!buf) || (len < n)))
 640     {
 641         wxUint32 cc;
 642 #ifdef WC_UTF16
 643         // cast is ok for WC_UTF16
 644         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 645         psz += (pa == (size_t)-1) ? 1 : pa;
 646 #else
 647         cc=(*psz++) & 0x7fffffff;
 648 #endif
 649         unsigned cnt;
 650         for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 651         if (!cnt)
 652         {
 653             // plain ASCII char
 654             if (buf)
 655                 *buf++ = (char) cc;
 656             len++;
 657         }
 658
 659         else
 660         {
 661             len += cnt + 1;
 662             if (buf)
 663             {
 664                 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 665                 while (cnt--)
 666                     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 667             }
 668         }
 669     }
 670
 671     if (buf && (len<n)) *buf = 0;
 672
 673     return len;
 674 }
 675
 676
 677
 678
 679 // ----------------------------------------------------------------------------
 680 // UTF-16
 681 // ----------------------------------------------------------------------------
 682
// The UTF-16 converters are implemented below under the names "straight"
// (same byte order as this machine) and "swap" (the opposite one); map these
// names to the real BE/LE classes according to the platform byte order.
#ifdef WORDS_BIGENDIAN
    #define wxMBConvUTF16straight wxMBConvUTF16BE
    #define wxMBConvUTF16swap     wxMBConvUTF16LE
#else
    #define wxMBConvUTF16swap     wxMBConvUTF16BE
    #define wxMBConvUTF16straight wxMBConvUTF16LE
#endif
 690
 691
 692 #ifdef WC_UTF16
 693
 694 // copy 16bit MB to 16bit String
 695 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 696 {
 697     size_t len=0;
 698
 699     while (*(wxUint16*)psz && (!buf || len < n))
 700     {
 701         if (buf)
 702             *buf++ = *(wxUint16*)psz;
 703         len++;
 704
 705         psz += sizeof(wxUint16);
 706     }
 707     if (buf && len<n)   *buf=0;
 708
 709     return len;
 710 }
 711
 712
 713 // copy 16bit String to 16bit MB
 714 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 715 {
 716     size_t len=0;
 717
 718     while (*psz && (!buf || len < n))
 719     {
 720         if (buf)
 721         {
 722             *(wxUint16*)buf = *psz;
 723             buf += sizeof(wxUint16);
 724         }
 725         len += sizeof(wxUint16);
 726         psz++;
 727     }
 728     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 729
 730     return len;
 731 }
 732
 733
 734 // swap 16bit MB to 16bit String
 735 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 736 {
 737     size_t len=0;
 738
 739     while (*(wxUint16*)psz && (!buf || len < n))
 740     {
 741         if (buf)
 742         {
 743             ((char *)buf)[0] = psz[1];
 744             ((char *)buf)[1] = psz[0];
 745             buf++;
 746         }
 747         len++;
 748         psz += sizeof(wxUint16);
 749     }
 750     if (buf && len<n)   *buf=0;
 751
 752     return len;
 753 }
 754
 755
 756 // swap 16bit MB to 16bit String
 757 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 758 {
 759     size_t len=0;
 760
 761     while (*psz && (!buf || len < n))
 762     {
 763         if (buf)
 764         {
 765             *buf++ = ((char*)psz)[1];
 766             *buf++ = ((char*)psz)[0];
 767         }
 768         len += sizeof(wxUint16);
 769         psz++;
 770     }
 771     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 772
 773     return len;
 774 }
 775
 776
 777 #else // WC_UTF16
 778
 779
 780 // copy 16bit MB to 32bit String
 781 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 782 {
 783     size_t len=0;
 784
 785     while (*(wxUint16*)psz && (!buf || len < n))
 786     {
 787         wxUint32 cc;
 788         size_t pa=decode_utf16((wxUint16*)psz, cc);
 789         if (pa == (size_t)-1)
 790             return pa;
 791
 792         if (buf)
 793             *buf++ = cc;
 794         len++;
 795         psz += pa * sizeof(wxUint16);
 796     }
 797     if (buf && len<n)   *buf=0;
 798
 799     return len;
 800 }
 801
 802
 803 // copy 32bit String to 16bit MB
 804 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 805 {
 806     size_t len=0;
 807
 808     while (*psz && (!buf || len < n))
 809     {
 810         wxUint16 cc[2];
 811         size_t pa=encode_utf16(*psz, cc);
 812
 813         if (pa == (size_t)-1)
 814             return pa;
 815
 816         if (buf)
 817         {
 818             *(wxUint16*)buf = cc[0];
 819             buf += sizeof(wxUint16);
 820             if (pa > 1)
 821             {
 822                 *(wxUint16*)buf = cc[1];
 823                 buf += sizeof(wxUint16);
 824             }
 825         }
 826
 827         len += pa*sizeof(wxUint16);
 828         psz++;
 829     }
 830     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 831
 832     return len;
 833 }
 834
 835
 836 // swap 16bit MB to 32bit String
 837 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 838 {
 839     size_t len=0;
 840
 841     while (*(wxUint16*)psz && (!buf || len < n))
 842     {
 843         wxUint32 cc;
 844         char tmp[4];
 845         tmp[0]=psz[1];  tmp[1]=psz[0];
 846         tmp[2]=psz[3];  tmp[3]=psz[2];
 847
 848         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 849         if (pa == (size_t)-1)
 850             return pa;
 851
 852         if (buf)
 853             *buf++ = cc;
 854
 855         len++;
 856         psz += pa * sizeof(wxUint16);
 857     }
 858     if (buf && len<n)   *buf=0;
 859
 860     return len;
 861 }
 862
 863
 864 // swap 32bit String to 16bit MB
 865 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 866 {
 867     size_t len=0;
 868
 869     while (*psz && (!buf || len < n))
 870     {
 871         wxUint16 cc[2];
 872         size_t pa=encode_utf16(*psz, cc);
 873
 874         if (pa == (size_t)-1)
 875             return pa;
 876
 877         if (buf)
 878         {
 879             *buf++ = ((char*)cc)[1];
 880             *buf++ = ((char*)cc)[0];
 881             if (pa > 1)
 882             {
 883                 *buf++ = ((char*)cc)[3];
 884                 *buf++ = ((char*)cc)[2];
 885             }
 886         }
 887
 888         len += pa*sizeof(wxUint16);
 889         psz++;
 890     }
 891     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 892
 893     return len;
 894 }
 895
 896 #endif // WC_UTF16
 897
 898
 899 // ----------------------------------------------------------------------------
 900 // UTF-32
 901 // ----------------------------------------------------------------------------
 902
// The UTF-32 converters are implemented below under the names "straight"
// (same byte order as this machine) and "swap" (the opposite one); map these
// names to the real BE/LE classes according to the platform byte order.
#ifdef WORDS_BIGENDIAN
#define wxMBConvUTF32straight  wxMBConvUTF32BE
#define wxMBConvUTF32swap      wxMBConvUTF32LE
#else
#define wxMBConvUTF32swap      wxMBConvUTF32BE
#define wxMBConvUTF32straight  wxMBConvUTF32LE
#endif
 910
 911
// Global instances of the two UTF-32 converters.
// NOTE(review): WXDLLIMPEXP_DATA_BASE presumably adds the DLL import/export
// decoration - confirm against its definition.
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
 914
 915
 916 #ifdef WC_UTF16
 917
 918 // copy 32bit MB to 16bit String
 919 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 920 {
 921     size_t len=0;
 922
 923     while (*(wxUint32*)psz && (!buf || len < n))
 924     {
 925         wxUint16 cc[2];
 926
 927         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
 928         if (pa == (size_t)-1)
 929             return pa;
 930
 931         if (buf)
 932         {
 933             *buf++ = cc[0];
 934             if (pa > 1)
 935                 *buf++ = cc[1];
 936         }
 937         len += pa;
 938         psz += sizeof(wxUint32);
 939     }
 940     if (buf && len<n)   *buf=0;
 941
 942     return len;
 943 }
 944
 945
 946 // copy 16bit String to 32bit MB
 947 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 948 {
 949     size_t len=0;
 950
 951     while (*psz && (!buf || len < n))
 952     {
 953         wxUint32 cc;
 954
 955         // cast is ok for WC_UTF16
 956         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 957         if (pa == (size_t)-1)
 958             return pa;
 959
 960         if (buf)
 961         {
 962             *(wxUint32*)buf = cc;
 963             buf += sizeof(wxUint32);
 964         }
 965         len += sizeof(wxUint32);
 966         psz += pa;
 967     }
 968
 969     if (buf && len<=n-sizeof(wxUint32))
 970         *(wxUint32*)buf=0;
 971
 972     return len;
 973 }
 974
 975
 976
 977 // swap 32bit MB to 16bit String
 978 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 979 {
 980     size_t len=0;
 981
 982     while (*(wxUint32*)psz && (!buf || len < n))
 983     {
 984         char tmp[4];
 985         tmp[0] = psz[3];   tmp[1] = psz[2];
 986         tmp[2] = psz[1];   tmp[3] = psz[0];
 987
 988
 989         wxUint16 cc[2];
 990
 991         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
 992         if (pa == (size_t)-1)
 993             return pa;
 994
 995         if (buf)
 996         {
 997             *buf++ = cc[0];
 998             if (pa > 1)
 999                 *buf++ = cc[1];
1000         }
1001         len += pa;
1002         psz += sizeof(wxUint32);
1003     }
1004
1005     if (buf && len<n)
1006         *buf=0;
1007
1008     return len;
1009 }
1010
1011
1012 // swap 16bit String to 32bit MB
1013 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1014 {
1015     size_t len=0;
1016
1017     while (*psz && (!buf || len < n))
1018     {
1019         char cc[4];
1020
1021         // cast is ok for WC_UTF16
1022         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
1023         if (pa == (size_t)-1)
1024             return pa;
1025
1026         if (buf)
1027         {
1028             *buf++ = cc[3];
1029             *buf++ = cc[2];
1030             *buf++ = cc[1];
1031             *buf++ = cc[0];
1032         }
1033         len += sizeof(wxUint32);
1034         psz += pa;
1035     }
1036
1037     if (buf && len<=n-sizeof(wxUint32))
1038         *(wxUint32*)buf=0;
1039
1040     return len;
1041 }
1042
1043 #else // WC_UTF16
1044
1045
1046 // copy 32bit MB to 32bit String
1047 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1048 {
1049     size_t len=0;
1050
1051     while (*(wxUint32*)psz && (!buf || len < n))
1052     {
1053         if (buf)
1054             *buf++ = *(wxUint32*)psz;
1055         len++;
1056         psz += sizeof(wxUint32);
1057     }
1058
1059     if (buf && len<n)
1060         *buf=0;
1061
1062     return len;
1063 }
1064
1065
1066 // copy 32bit String to 32bit MB
1067 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1068 {
1069     size_t len=0;
1070
1071     while (*psz && (!buf || len < n))
1072     {
1073         if (buf)
1074         {
1075             *(wxUint32*)buf = *psz;
1076             buf += sizeof(wxUint32);
1077         }
1078
1079         len += sizeof(wxUint32);
1080         psz++;
1081     }
1082
1083     if (buf && len<=n-sizeof(wxUint32))
1084         *(wxUint32*)buf=0;
1085
1086     return len;
1087 }
1088
1089
1090 // swap 32bit MB to 32bit String
1091 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1092 {
1093     size_t len=0;
1094
1095     while (*(wxUint32*)psz && (!buf || len < n))
1096     {
1097         if (buf)
1098         {
1099             ((char *)buf)[0] = psz[3];
1100             ((char *)buf)[1] = psz[2];
1101             ((char *)buf)[2] = psz[1];
1102             ((char *)buf)[3] = psz[0];
1103             buf++;
1104         }
1105         len++;
1106         psz += sizeof(wxUint32);
1107     }
1108
1109     if (buf && len<n)
1110         *buf=0;
1111
1112     return len;
1113 }
1114
1115
1116 // swap 32bit String to 32bit MB
1117 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1118 {
1119     size_t len=0;
1120
1121     while (*psz && (!buf || len < n))
1122     {
1123         if (buf)
1124         {
1125             *buf++ = ((char *)psz)[3];
1126             *buf++ = ((char *)psz)[2];
1127             *buf++ = ((char *)psz)[1];
1128             *buf++ = ((char *)psz)[0];
1129         }
1130         len += sizeof(wxUint32);
1131         psz++;
1132     }
1133
1134     if (buf && len<=n-sizeof(wxUint32))
1135         *(wxUint32*)buf=0;
1136
1137     return len;
1138 }
1139
1140
1141 #endif // WC_UTF16
1142
1143
1144 // ============================================================================
1145 // The classes doing conversion using the iconv_xxx() functions
1146 // ============================================================================
1147
1148 #ifdef HAVE_ICONV
1149
1150 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with
1151 //     E2BIG if output buffer is _exactly_ as big as needed. Such case is
1152 //     (unless there's yet another bug in glibc) the only case when iconv()
1153 //     returns with (size_t)-1 (which means error) and says there are 0 bytes
1154 //     left in the input buffer -- when _real_ error occurs,
1155 //     bytes-left-in-input buffer is non-zero. Hence, this alternative test for
1156 //     iconv() failure.
1157 //     [This bug does not appear in glibc 2.2.]
// ICONV_FAILED(cres, bufLeft): true if the iconv() call which returned cres
// and left bufLeft bytes of unconsumed input really failed. The arguments are
// parenthesized so that arbitrary expressions can be passed safely.
#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
#define ICONV_FAILED(cres, bufLeft) (((cres) == (size_t)-1) && \
                                     (errno != E2BIG || (bufLeft) != 0))
#else
#define ICONV_FAILED(cres, bufLeft)  ((cres) == (size_t)-1)
#endif

// cast the address of an input pointer to the type expected by iconv()
#define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1166
1167 // ----------------------------------------------------------------------------
1168 // wxMBConv_iconv: encapsulates an iconv character set
1169 // ----------------------------------------------------------------------------
1170
1171 class wxMBConv_iconv : public wxMBConv
1172 {
1173 public:
1174     wxMBConv_iconv(const wxChar *name);
1175     virtual ~wxMBConv_iconv();
1176
1177     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1178     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1179
1180     bool IsOk() const
1181         { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1182
1183 protected:
1184     // the iconv handlers used to translate from multibyte to wide char and in
1185     // the other direction
1186     iconv_t m2w,
1187             w2m;
1188 #if wxUSE_THREADS
1189     // guards access to m2w and w2m objects
1190     wxMutex m_iconvMutex;
1191 #endif
1192
1193 private:
1194     // the name (for iconv_open()) of a wide char charset -- if none is
1195     // available on this machine, it will remain NULL
1196     static const char *ms_wcCharsetName;
1197
1198     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1199     // different endian-ness than the native one
1200     static bool ms_wcNeedsSwap;
1201 };
1202
1203 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1204 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1205
1206 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1207 {
1208     // Do it the hard way
1209     char cname[100];
1210     for (size_t i = 0; i < wxStrlen(name)+1; i++)
1211         cname[i] = (char) name[i];
1212
1213     // check for charset that represents wchar_t:
1214     if (ms_wcCharsetName == NULL)
1215     {
1216         ms_wcNeedsSwap = false;
1217
1218         // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1219         ms_wcCharsetName = WC_NAME_BEST;
1220         m2w = iconv_open(ms_wcCharsetName, cname);
1221
1222         if (m2w == (iconv_t)-1)
1223         {
1224             // try charset w/o bytesex info (e.g. "UCS4")
1225             // and check for bytesex ourselves:
1226             ms_wcCharsetName = WC_NAME;
1227             m2w = iconv_open(ms_wcCharsetName, cname);
1228
1229             // last bet, try if it knows WCHAR_T pseudo-charset
1230             if (m2w == (iconv_t)-1)
1231             {
1232                 ms_wcCharsetName = "WCHAR_T";
1233                 m2w = iconv_open(ms_wcCharsetName, cname);
1234             }
1235
1236             if (m2w != (iconv_t)-1)
1237             {
1238                 char    buf[2], *bufPtr;
1239                 wchar_t wbuf[2], *wbufPtr;
1240                 size_t  insz, outsz;
1241                 size_t  res;
1242
1243                 buf[0] = 'A';
1244                 buf[1] = 0;
1245                 wbuf[0] = 0;
1246                 insz = 2;
1247                 outsz = SIZEOF_WCHAR_T * 2;
1248                 wbufPtr = wbuf;
1249                 bufPtr = buf;
1250
1251                 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1252                             (char**)&wbufPtr, &outsz);
1253
1254                 if (ICONV_FAILED(res, insz))
1255                 {
1256                     ms_wcCharsetName = NULL;
1257                     wxLogLastError(wxT("iconv"));
1258                     wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1259                 }
1260                 else
1261                 {
1262                     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1263                 }
1264             }
1265             else
1266             {
1267                 ms_wcCharsetName = NULL;
1268
1269                 // VS: we must not output an error here, since wxWidgets will safely
1270                 //     fall back to using wxEncodingConverter.
1271                 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1272                 //wxLogError(
1273             }
1274         }
1275         wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1276     }
1277     else // we already have ms_wcCharsetName
1278     {
1279         m2w = iconv_open(ms_wcCharsetName, cname);
1280     }
1281
1282     // NB: don't ever pass NULL to iconv_open(), it may crash!
1283     if ( ms_wcCharsetName )
1284     {
1285         w2m = iconv_open( cname, ms_wcCharsetName);
1286     }
1287     else
1288     {
1289         w2m = (iconv_t)-1;
1290     }
1291 }
1292
1293 wxMBConv_iconv::~wxMBConv_iconv()
1294 {
1295     if ( m2w != (iconv_t)-1 )
1296         iconv_close(m2w);
1297     if ( w2m != (iconv_t)-1 )
1298         iconv_close(w2m);
1299 }
1300
1301 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1302 {
1303 #if wxUSE_THREADS
1304     // NB: iconv() is MT-safe, but each thread must use it's own iconv_t handle.
1305     //     Unfortunately there is a couple of global wxCSConv objects such as
1306     //     wxConvLocal that are used all over wx code, so we have to make sure
1307     //     the handle is used by at most one thread at the time. Otherwise
1308     //     only a few wx classes would be safe to use from non-main threads
1309     //     as MB<->WC conversion would fail "randomly".
1310     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1311 #endif
1312
1313     size_t inbuf = strlen(psz);
1314     size_t outbuf = n * SIZEOF_WCHAR_T;
1315     size_t res, cres;
1316     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1317     wchar_t *bufPtr = buf;
1318     const char *pszPtr = psz;
1319
1320     if (buf)
1321     {
1322         // have destination buffer, convert there
1323         cres = iconv(m2w,
1324                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1325                      (char**)&bufPtr, &outbuf);
1326         res = n - (outbuf / SIZEOF_WCHAR_T);
1327
1328         if (ms_wcNeedsSwap)
1329         {
1330             // convert to native endianness
1331             WC_BSWAP(buf /* _not_ bufPtr */, res)
1332         }
1333
1334         // NB: iconv was given only strlen(psz) characters on input, and so
1335         //     it couldn't convert the trailing zero. Let's do it ourselves
1336         //     if there's some room left for it in the output buffer.
1337         if (res < n)
1338             buf[res] = 0;
1339     }
1340     else
1341     {
1342         // no destination buffer... convert using temp buffer
1343         // to calculate destination buffer requirement
1344         wchar_t tbuf[8];
1345         res = 0;
1346         do {
1347             bufPtr = tbuf;
1348             outbuf = 8*SIZEOF_WCHAR_T;
1349
1350             cres = iconv(m2w,
1351                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1352                          (char**)&bufPtr, &outbuf );
1353
1354             res += 8-(outbuf/SIZEOF_WCHAR_T);
1355         } while ((cres==(size_t)-1) && (errno==E2BIG));
1356     }
1357
1358     if (ICONV_FAILED(cres, inbuf))
1359     {
1360         //VS: it is ok if iconv fails, hence trace only
1361         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1362         return (size_t)-1;
1363     }
1364
1365     return res;
1366 }
1367
1368 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1369 {
1370 #if wxUSE_THREADS
1371     // NB: explained in MB2WC
1372     wxMutexLocker lock(wxConstCast(this, wxMBConv_iconv)->m_iconvMutex);
1373 #endif
1374
1375     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1376     size_t outbuf = n;
1377     size_t res, cres;
1378
1379     wchar_t *tmpbuf = 0;
1380
1381     if (ms_wcNeedsSwap)
1382     {
1383         // need to copy to temp buffer to switch endianness
1384         // this absolutely doesn't rock!
1385         // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1386         //  could be in read-only memory, or be accessed in some other thread)
1387         tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1388         memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1389         WC_BSWAP(tmpbuf, inbuf)
1390         psz=tmpbuf;
1391     }
1392
1393     if (buf)
1394     {
1395         // have destination buffer, convert there
1396         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1397
1398         res = n-outbuf;
1399
1400         // NB: iconv was given only wcslen(psz) characters on input, and so
1401         //     it couldn't convert the trailing zero. Let's do it ourselves
1402         //     if there's some room left for it in the output buffer.
1403         if (res < n)
1404             buf[0] = 0;
1405     }
1406     else
1407     {
1408         // no destination buffer... convert using temp buffer
1409         // to calculate destination buffer requirement
1410         char tbuf[16];
1411         res = 0;
1412         do {
1413             buf = tbuf; outbuf = 16;
1414
1415             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1416
1417             res += 16 - outbuf;
1418         } while ((cres==(size_t)-1) && (errno==E2BIG));
1419     }
1420
1421     if (ms_wcNeedsSwap)
1422     {
1423         free(tmpbuf);
1424     }
1425
1426     if (ICONV_FAILED(cres, inbuf))
1427     {
1428         //VS: it is ok if iconv fails, hence trace only
1429         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1430         return (size_t)-1;
1431     }
1432
1433     return res;
1434 }
1435
1436 #endif // HAVE_ICONV
1437
1438
1439 // ============================================================================
1440 // Win32 conversion classes
1441 // ============================================================================
1442
1443 #ifdef wxHAVE_WIN32_MB2WC
1444
1445 // from utils.cpp
1446 #if wxUSE_FONTMAP
1447 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1448 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1449 #endif
1450
// Converter using the Win32 MultiByteToWideChar() and WideCharToMultiByte()
// functions with the code page stored in m_CodePage.
class wxMBConv_win32 : public wxMBConv
{
public:
    // default ctor uses CP_ACP as the code page
    wxMBConv_win32()
    {
        m_CodePage = CP_ACP;
    }

#if wxUSE_FONTMAP
    // use the code page wxCharsetToCodepage() returns for this charset name
    wxMBConv_win32(const wxChar* name)
    {
        m_CodePage = wxCharsetToCodepage(name);
    }

    // use the code page wxEncodingToCodepage() returns for this encoding
    wxMBConv_win32(wxFontEncoding encoding)
    {
        m_CodePage = wxEncodingToCodepage(encoding);
    }
#endif

    // Convert the NUL-terminated string psz to wide characters. If buf is
    // NULL a size of 0 is passed to the API to query the needed length.
    // Returns the length without the terminating NUL or (size_t)-1 if the
    // API returned 0.
    size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
    {
        // note that we have to use the MB_ERR_INVALID_CHARS flag as without
        // it the behaviour is not compatible with the Unix version (using
        // iconv) and breaks the library itself, e.g.
        // wxTextInputStream::NextChar() wouldn't work if reading an
        // incomplete MB char didn't result in an error
        //
        // note however that using MB_ERR_INVALID_CHARS with CP_UTF7 results in
        // an error (tested under Windows Server 2003) and apparently it is
        // done on purpose, i.e. the function accepts any input in this case
        // and although I'd prefer to return an error on ill-formed input, our
        // own wxMBConvUTF7 doesn't detect errors either (e.g. a lone "+"
        // which is explicitly ill-formed according to RFC 2152) so we don't
        // even have any fallback here...
        int flags = m_CodePage == CP_UTF7 ? 0 : MB_ERR_INVALID_CHARS;

        const size_t len = ::MultiByteToWideChar
                             (
                                m_CodePage,     // code page
                                flags,          // flags: fail on error
                                psz,            // input string
                                -1,             // its length (NUL-terminated)
                                buf,            // output string
                                buf ? n : 0     // size of output buffer
                             );

        // note that it returns count of written chars for buf != NULL and size
        // of the needed buffer for buf == NULL so in either case the length of
        // the string (which never includes the terminating NUL) is one less
        return len ? len - 1 : (size_t)-1;
    }

    // Convert the NUL-terminated wide string pwz to the code page. Returns
    // the length without the terminating NUL or (size_t)-1 if the API failed
    // or, when really converting (buf != NULL), if the conversion was
    // detected to be lossy.
    size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
    {
        /*
            we have a problem here: by default, WideCharToMultiByte() may
            replace characters unrepresentable in the target code page with bad
            quality approximations such as turning "1/2" symbol (U+00BD) into
            "1" for the code pages which don't have it and we, obviously, want
            to avoid this at any price

            the trouble is that this function does it _silently_, i.e. it won't
            even tell us whether it did or not... Win98/2000 and higher provide
            WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
            we have to resort to a round trip, i.e. check that converting back
            results in the same string -- this is, of course, expensive but
            otherwise we simply can't be sure to not garble the data.
         */

        // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
        // it doesn't work with CJK encodings (which we test for rather roughly
        // here...) nor with UTF-7/8 nor, of course, with Windows versions not
        // supporting it
        BOOL usedDef wxDUMMY_INITIALIZE(false);
        BOOL *pUsedDef;
        int flags;
        if ( CanUseNoBestFit() && m_CodePage < 50000 )
        {
            // it's our lucky day
            flags = WC_NO_BEST_FIT_CHARS;
            pUsedDef = &usedDef;
        }
        else // old system or unsupported encoding
        {
            flags = 0;
            pUsedDef = NULL;
        }

        const size_t len = ::WideCharToMultiByte
                             (
                                m_CodePage,     // code page
                                flags,          // either none or no best fit
                                pwz,            // input string
                                -1,             // it is (wide) NUL-terminated
                                buf,            // output buffer
                                buf ? n : 0,    // and its size
                                NULL,           // default "replacement" char
                                pUsedDef        // [out] was it used?
                             );

        if ( !len )
        {
            // function totally failed
            return (size_t)-1;
        }

        // if we were really converting, check if we succeeded
        if ( buf )
        {
            if ( flags )
            {
                // check if the conversion failed, i.e. if any replacements
                // were done
                if ( usedDef )
                    return (size_t)-1;
            }
            else // we must resort to double tripping...
            {
                // convert the result back and compare it with the input
                wxWCharBuffer wcBuf(n);
                if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
                        wcscmp(wcBuf, pwz) != 0 )
                {
                    // we didn't obtain the same thing we started from, hence
                    // the conversion was lossy and we consider that it failed
                    return (size_t)-1;
                }
            }
        }

        // as in MB2WC(), the count returned by the API includes the
        // terminating NUL, hence "len - 1"
        return len - 1;
    }

    // the code page is considered invalid only if it is -1
    bool IsOk() const { return m_CodePage != -1; }

private:
    // Returns true if the OS version check below allows using
    // WC_NO_BEST_FIT_CHARS; the result is computed on first call and cached
    // in a function-level static.
    static bool CanUseNoBestFit()
    {
        // -1 means "not determined yet", otherwise 0 or 1
        static int s_isWin98Or2k = -1;

        if ( s_isWin98Or2k == -1 )
        {
            int verMaj, verMin;
            switch ( wxGetOsVersion(&verMaj, &verMin) )
            {
                case wxWIN95:
                    s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
                    break;

                case wxWINDOWS_NT:
                    s_isWin98Or2k = verMaj >= 5;
                    break;

                default:
                    // unknown, be conservative by default
                    s_isWin98Or2k = 0;
            }

            wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
        }

        return s_isWin98Or2k == 1;
    }

    // the code page used for the conversions
    long m_CodePage;
};
1618
1619 #endif // wxHAVE_WIN32_MB2WC
1620
1621 // ============================================================================
1622 // Cocoa conversion classes
1623 // ============================================================================
1624
1625 #if defined(__WXCOCOA__)
1626
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32
// internally quite a bit - it's just not public (yet).
1630
1631 #include <CoreFoundation/CFString.h>
1632 #include <CoreFoundation/CFStringEncodingExt.h>
1633
1634 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1635 {
1636     CFStringEncoding enc = kCFStringEncodingInvalidId ;
1637     if ( encoding == wxFONTENCODING_DEFAULT )
1638     {
1639         enc = CFStringGetSystemEncoding();
1640     }
1641     else switch( encoding)
1642     {
1643         case wxFONTENCODING_ISO8859_1 :
1644             enc = kCFStringEncodingISOLatin1 ;
1645             break ;
1646         case wxFONTENCODING_ISO8859_2 :
1647             enc = kCFStringEncodingISOLatin2;
1648             break ;
1649         case wxFONTENCODING_ISO8859_3 :
1650             enc = kCFStringEncodingISOLatin3 ;
1651             break ;
1652         case wxFONTENCODING_ISO8859_4 :
1653             enc = kCFStringEncodingISOLatin4;
1654             break ;
1655         case wxFONTENCODING_ISO8859_5 :
1656             enc = kCFStringEncodingISOLatinCyrillic;
1657             break ;
1658         case wxFONTENCODING_ISO8859_6 :
1659             enc = kCFStringEncodingISOLatinArabic;
1660             break ;
1661         case wxFONTENCODING_ISO8859_7 :
1662             enc = kCFStringEncodingISOLatinGreek;
1663             break ;
1664         case wxFONTENCODING_ISO8859_8 :
1665             enc = kCFStringEncodingISOLatinHebrew;
1666             break ;
1667         case wxFONTENCODING_ISO8859_9 :
1668             enc = kCFStringEncodingISOLatin5;
1669             break ;
1670         case wxFONTENCODING_ISO8859_10 :
1671             enc = kCFStringEncodingISOLatin6;
1672             break ;
1673         case wxFONTENCODING_ISO8859_11 :
1674             enc = kCFStringEncodingISOLatinThai;
1675             break ;
1676         case wxFONTENCODING_ISO8859_13 :
1677             enc = kCFStringEncodingISOLatin7;
1678             break ;
1679         case wxFONTENCODING_ISO8859_14 :
1680             enc = kCFStringEncodingISOLatin8;
1681             break ;
1682         case wxFONTENCODING_ISO8859_15 :
1683             enc = kCFStringEncodingISOLatin9;
1684             break ;
1685
1686         case wxFONTENCODING_KOI8 :
1687             enc = kCFStringEncodingKOI8_R;
1688             break ;
1689         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1690             enc = kCFStringEncodingDOSRussian;
1691             break ;
1692
1693 //      case wxFONTENCODING_BULGARIAN :
1694 //          enc = ;
1695 //          break ;
1696
1697         case wxFONTENCODING_CP437 :
1698             enc =kCFStringEncodingDOSLatinUS ;
1699             break ;
1700         case wxFONTENCODING_CP850 :
1701             enc = kCFStringEncodingDOSLatin1;
1702             break ;
1703         case wxFONTENCODING_CP852 :
1704             enc = kCFStringEncodingDOSLatin2;
1705             break ;
1706         case wxFONTENCODING_CP855 :
1707             enc = kCFStringEncodingDOSCyrillic;
1708             break ;
1709         case wxFONTENCODING_CP866 :
1710             enc =kCFStringEncodingDOSRussian ;
1711             break ;
1712         case wxFONTENCODING_CP874 :
1713             enc = kCFStringEncodingDOSThai;
1714             break ;
1715         case wxFONTENCODING_CP932 :
1716             enc = kCFStringEncodingDOSJapanese;
1717             break ;
1718         case wxFONTENCODING_CP936 :
1719             enc =kCFStringEncodingDOSChineseSimplif ;
1720             break ;
1721         case wxFONTENCODING_CP949 :
1722             enc = kCFStringEncodingDOSKorean;
1723             break ;
1724         case wxFONTENCODING_CP950 :
1725             enc = kCFStringEncodingDOSChineseTrad;
1726             break ;
1727         case wxFONTENCODING_CP1250 :
1728             enc = kCFStringEncodingWindowsLatin2;
1729             break ;
1730         case wxFONTENCODING_CP1251 :
1731             enc =kCFStringEncodingWindowsCyrillic ;
1732             break ;
1733         case wxFONTENCODING_CP1252 :
1734             enc =kCFStringEncodingWindowsLatin1 ;
1735             break ;
1736         case wxFONTENCODING_CP1253 :
1737             enc = kCFStringEncodingWindowsGreek;
1738             break ;
1739         case wxFONTENCODING_CP1254 :
1740             enc = kCFStringEncodingWindowsLatin5;
1741             break ;
1742         case wxFONTENCODING_CP1255 :
1743             enc =kCFStringEncodingWindowsHebrew ;
1744             break ;
1745         case wxFONTENCODING_CP1256 :
1746             enc =kCFStringEncodingWindowsArabic ;
1747             break ;
1748         case wxFONTENCODING_CP1257 :
1749             enc = kCFStringEncodingWindowsBalticRim;
1750             break ;
1751 //   This only really encodes to UTF7 (if that) evidently
1752 //        case wxFONTENCODING_UTF7 :
1753 //            enc = kCFStringEncodingNonLossyASCII ;
1754 //            break ;
1755         case wxFONTENCODING_UTF8 :
1756             enc = kCFStringEncodingUTF8 ;
1757             break ;
1758         case wxFONTENCODING_EUC_JP :
1759             enc = kCFStringEncodingEUC_JP;
1760             break ;
1761         case wxFONTENCODING_UTF16 :
1762             enc = kCFStringEncodingUnicode ;
1763             break ;
1764         case wxFONTENCODING_MACROMAN :
1765             enc = kCFStringEncodingMacRoman ;
1766             break ;
1767         case wxFONTENCODING_MACJAPANESE :
1768             enc = kCFStringEncodingMacJapanese ;
1769             break ;
1770         case wxFONTENCODING_MACCHINESETRAD :
1771             enc = kCFStringEncodingMacChineseTrad ;
1772             break ;
1773         case wxFONTENCODING_MACKOREAN :
1774             enc = kCFStringEncodingMacKorean ;
1775             break ;
1776         case wxFONTENCODING_MACARABIC :
1777             enc = kCFStringEncodingMacArabic ;
1778             break ;
1779         case wxFONTENCODING_MACHEBREW :
1780             enc = kCFStringEncodingMacHebrew ;
1781             break ;
1782         case wxFONTENCODING_MACGREEK :
1783             enc = kCFStringEncodingMacGreek ;
1784             break ;
1785         case wxFONTENCODING_MACCYRILLIC :
1786             enc = kCFStringEncodingMacCyrillic ;
1787             break ;
1788         case wxFONTENCODING_MACDEVANAGARI :
1789             enc = kCFStringEncodingMacDevanagari ;
1790             break ;
1791         case wxFONTENCODING_MACGURMUKHI :
1792             enc = kCFStringEncodingMacGurmukhi ;
1793             break ;
1794         case wxFONTENCODING_MACGUJARATI :
1795             enc = kCFStringEncodingMacGujarati ;
1796             break ;
1797         case wxFONTENCODING_MACORIYA :
1798             enc = kCFStringEncodingMacOriya ;
1799             break ;
1800         case wxFONTENCODING_MACBENGALI :
1801             enc = kCFStringEncodingMacBengali ;
1802             break ;
1803         case wxFONTENCODING_MACTAMIL :
1804             enc = kCFStringEncodingMacTamil ;
1805             break ;
1806         case wxFONTENCODING_MACTELUGU :
1807             enc = kCFStringEncodingMacTelugu ;
1808             break ;
1809         case wxFONTENCODING_MACKANNADA :
1810             enc = kCFStringEncodingMacKannada ;
1811             break ;
1812         case wxFONTENCODING_MACMALAJALAM :
1813             enc = kCFStringEncodingMacMalayalam ;
1814             break ;
1815         case wxFONTENCODING_MACSINHALESE :
1816             enc = kCFStringEncodingMacSinhalese ;
1817             break ;
1818         case wxFONTENCODING_MACBURMESE :
1819             enc = kCFStringEncodingMacBurmese ;
1820             break ;
1821         case wxFONTENCODING_MACKHMER :
1822             enc = kCFStringEncodingMacKhmer ;
1823             break ;
1824         case wxFONTENCODING_MACTHAI :
1825             enc = kCFStringEncodingMacThai ;
1826             break ;
1827         case wxFONTENCODING_MACLAOTIAN :
1828             enc = kCFStringEncodingMacLaotian ;
1829             break ;
1830         case wxFONTENCODING_MACGEORGIAN :
1831             enc = kCFStringEncodingMacGeorgian ;
1832             break ;
1833         case wxFONTENCODING_MACARMENIAN :
1834             enc = kCFStringEncodingMacArmenian ;
1835             break ;
1836         case wxFONTENCODING_MACCHINESESIMP :
1837             enc = kCFStringEncodingMacChineseSimp ;
1838             break ;
1839         case wxFONTENCODING_MACTIBETAN :
1840             enc = kCFStringEncodingMacTibetan ;
1841             break ;
1842         case wxFONTENCODING_MACMONGOLIAN :
1843             enc = kCFStringEncodingMacMongolian ;
1844             break ;
1845         case wxFONTENCODING_MACETHIOPIC :
1846             enc = kCFStringEncodingMacEthiopic ;
1847             break ;
1848         case wxFONTENCODING_MACCENTRALEUR :
1849             enc = kCFStringEncodingMacCentralEurRoman ;
1850             break ;
1851         case wxFONTENCODING_MACVIATNAMESE :
1852             enc = kCFStringEncodingMacVietnamese ;
1853             break ;
1854         case wxFONTENCODING_MACARABICEXT :
1855             enc = kCFStringEncodingMacExtArabic ;
1856             break ;
1857         case wxFONTENCODING_MACSYMBOL :
1858             enc = kCFStringEncodingMacSymbol ;
1859             break ;
1860         case wxFONTENCODING_MACDINGBATS :
1861             enc = kCFStringEncodingMacDingbats ;
1862             break ;
1863         case wxFONTENCODING_MACTURKISH :
1864             enc = kCFStringEncodingMacTurkish ;
1865             break ;
1866         case wxFONTENCODING_MACCROATIAN :
1867             enc = kCFStringEncodingMacCroatian ;
1868             break ;
1869         case wxFONTENCODING_MACICELANDIC :
1870             enc = kCFStringEncodingMacIcelandic ;
1871             break ;
1872         case wxFONTENCODING_MACROMANIAN :
1873             enc = kCFStringEncodingMacRomanian ;
1874             break ;
1875         case wxFONTENCODING_MACCELTIC :
1876             enc = kCFStringEncodingMacCeltic ;
1877             break ;
1878         case wxFONTENCODING_MACGAELIC :
1879             enc = kCFStringEncodingMacGaelic ;
1880             break ;
1881 //      case wxFONTENCODING_MACKEYBOARD :
1882 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
1883 //          break ;
1884         default :
1885             // because gcc is picky
1886             break ;
1887     } ;
1888     return enc ;
1889 }
1890
1891 class wxMBConv_cocoa : public wxMBConv
1892 {
1893 public:
    // default ctor: use the encoding returned by CFStringGetSystemEncoding()
    wxMBConv_cocoa()
    {
        Init(CFStringGetSystemEncoding()) ;
    }
1898
    // create the converter from a charset name: the name is first mapped to
    // a wxFontEncoding by wxFontMapper and then to a CFStringEncoding
    wxMBConv_cocoa(const wxChar* name)
    {
        Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
    }
1903
    // create the converter for the CFStringEncoding corresponding to the
    // given wxFontEncoding
    wxMBConv_cocoa(wxFontEncoding encoding)
    {
        Init( wxCFStringEncFromFontEnc(encoding) );
    }
1908
    // nothing to do here
    ~wxMBConv_cocoa()
    {
    }
1912
    // common part of all ctors: remember the encoding to use
    void Init( CFStringEncoding encoding)
    {
        m_encoding = encoding ;
    }
1917
1918     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
1919     {
1920         wxASSERT(szUnConv);
1921
1922         CFStringRef theString = CFStringCreateWithBytes (
1923                                                 NULL, //the allocator
1924                                                 (const UInt8*)szUnConv,
1925                                                 strlen(szUnConv),
1926                                                 m_encoding,
1927                                                 false //no BOM/external representation
1928                                                 );
1929
1930         wxASSERT(theString);
1931
1932         size_t nOutLength = CFStringGetLength(theString);
1933
1934         if (szOut == NULL)
1935         {
1936             CFRelease(theString);
1937             return nOutLength;
1938         }
1939
1940         CFRange theRange = { 0, nOutSize };
1941
1942 #if SIZEOF_WCHAR_T == 4
1943         UniChar* szUniCharBuffer = new UniChar[nOutSize];
1944 #endif
1945
1946         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
1947
1948         CFRelease(theString);
1949
1950         szUniCharBuffer[nOutLength] = '\0' ;
1951
1952 #if SIZEOF_WCHAR_T == 4
1953         wxMBConvUTF16 converter ;
1954         converter.MB2WC(szOut, (const char*)szUniCharBuffer , nOutSize ) ;
1955         delete[] szUniCharBuffer;
1956 #endif
1957
1958         return nOutLength;
1959     }
1960
1961     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
1962     {
1963         wxASSERT(szUnConv);
1964
1965         size_t nRealOutSize;
1966         size_t nBufSize = wxWcslen(szUnConv);
1967         UniChar* szUniBuffer = (UniChar*) szUnConv;
1968
1969 #if SIZEOF_WCHAR_T == 4
1970         wxMBConvUTF16BE converter ;
1971         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
1972         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
1973         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
1974         nBufSize /= sizeof(UniChar);
1975 #endif
1976
1977         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
1978                                 NULL, //allocator
1979                                 szUniBuffer,
1980                                 nBufSize,
1981                                 kCFAllocatorNull //deallocator - we want to deallocate it ourselves
1982                             );
1983
1984         wxASSERT(theString);
1985
1986         //Note that CER puts a BOM when converting to unicode
1987         //so we  check and use getchars instead in that case
1988         if (m_encoding == kCFStringEncodingUnicode)
1989         {
1990             if (szOut != NULL)
1991                 CFStringGetCharacters(theString, CFRangeMake(0, nOutSize - 1), (UniChar*) szOut);
1992
1993             nRealOutSize = CFStringGetLength(theString) + 1;
1994         }
1995         else
1996         {
1997             CFStringGetBytes(
1998                 theString,
1999                 CFRangeMake(0, CFStringGetLength(theString)),
2000                 m_encoding,
2001                 0, //what to put in characters that can't be converted -
2002                     //0 tells CFString to return NULL if it meets such a character
2003                 false, //not an external representation
2004                 (UInt8*) szOut,
2005                 nOutSize,
2006                 (CFIndex*) &nRealOutSize
2007                         );
2008         }
2009
2010         CFRelease(theString);
2011
2012 #if SIZEOF_WCHAR_T == 4
2013         delete[] szUniBuffer;
2014 #endif
2015
2016         return  nRealOutSize - 1;
2017     }
2018
2019     bool IsOk() const
2020     {
2021         return m_encoding != kCFStringEncodingInvalidId &&
2022               CFStringIsEncodingAvailable(m_encoding);
2023     }
2024
2025 private:
2026     CFStringEncoding m_encoding ;
2027 };
2028
2029 #endif // defined(__WXCOCOA__)
2030
2031 // ============================================================================
2032 // Mac conversion classes
2033 // ============================================================================
2034
2035 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2036
2037 class wxMBConv_mac : public wxMBConv
2038 {
2039 public:
2040     wxMBConv_mac()
2041     {
2042         Init(CFStringGetSystemEncoding()) ;
2043     }
2044
2045     wxMBConv_mac(const wxChar* name)
2046     {
2047         Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2048     }
2049
2050     wxMBConv_mac(wxFontEncoding encoding)
2051     {
2052         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2053     }
2054
2055     ~wxMBConv_mac()
2056     {
2057         OSStatus status = noErr ;
2058         status = TECDisposeConverter(m_MB2WC_converter);
2059         status = TECDisposeConverter(m_WC2MB_converter);
2060     }
2061
2062
2063     void Init( TextEncodingBase encoding)
2064     {
2065         OSStatus status = noErr ;
2066         m_char_encoding = encoding ;
2067         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2068
2069         status = TECCreateConverter(&m_MB2WC_converter,
2070                                     m_char_encoding,
2071                                     m_unicode_encoding);
2072         status = TECCreateConverter(&m_WC2MB_converter,
2073                                     m_unicode_encoding,
2074                                     m_char_encoding);
2075     }
2076
2077     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2078     {
2079         OSStatus status = noErr ;
2080         ByteCount byteOutLen ;
2081         ByteCount byteInLen = strlen(psz) ;
2082         wchar_t *tbuf = NULL ;
2083         UniChar* ubuf = NULL ;
2084         size_t res = 0 ;
2085
2086         if (buf == NULL)
2087         {
2088             //apple specs say at least 32
2089             n = wxMax( 32 , byteInLen ) ;
2090             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2091         }
2092         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2093 #if SIZEOF_WCHAR_T == 4
2094         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2095 #else
2096         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2097 #endif
2098         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2099           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2100 #if SIZEOF_WCHAR_T == 4
2101         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2102         // is not properly terminated we get random characters at the end
2103         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2104         wxMBConvUTF16BE converter ;
2105         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2106         free( ubuf ) ;
2107 #else
2108         res = byteOutLen / sizeof( UniChar ) ;
2109 #endif
2110         if ( buf == NULL )
2111              free(tbuf) ;
2112
2113         if ( buf  && res < n)
2114             buf[res] = 0;
2115
2116         return res ;
2117     }
2118
2119     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2120     {
2121         OSStatus status = noErr ;
2122         ByteCount byteOutLen ;
2123         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2124
2125         char *tbuf = NULL ;
2126
2127         if (buf == NULL)
2128         {
2129             //apple specs say at least 32
2130             n = wxMax( 32 , ((byteInLen / SIZEOF_WCHAR_T) * 8) + SIZEOF_WCHAR_T );
2131             tbuf = (char*) malloc( n ) ;
2132         }
2133
2134         ByteCount byteBufferLen = n ;
2135         UniChar* ubuf = NULL ;
2136 #if SIZEOF_WCHAR_T == 4
2137         wxMBConvUTF16BE converter ;
2138         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2139         byteInLen = unicharlen ;
2140         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2141         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2142 #else
2143         ubuf = (UniChar*) psz ;
2144 #endif
2145         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2146             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2147 #if SIZEOF_WCHAR_T == 4
2148         free( ubuf ) ;
2149 #endif
2150         if ( buf == NULL )
2151             free(tbuf) ;
2152
2153         size_t res = byteOutLen ;
2154         if ( buf  && res < n)
2155         {
2156             buf[res] = 0;
2157
2158             //we need to double-trip to verify it didn't insert any ? in place
2159             //of bogus characters
2160             wxWCharBuffer wcBuf(n);
2161             size_t pszlen = wxWcslen(psz);
2162             if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
2163                         wxWcslen(wcBuf) != pszlen ||
2164                         memcmp(wcBuf, psz, pszlen * sizeof(wchar_t)) != 0 )
2165             {
2166                 // we didn't obtain the same thing we started from, hence
2167                 // the conversion was lossy and we consider that it failed
2168                 return (size_t)-1;
2169             }
2170         }
2171
2172         return res ;
2173     }
2174
2175     bool IsOk() const
2176         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2177
2178 private:
2179     TECObjectRef m_MB2WC_converter ;
2180     TECObjectRef m_WC2MB_converter ;
2181
2182     TextEncodingBase m_char_encoding ;
2183     TextEncodingBase m_unicode_encoding ;
2184 };
2185
2186 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2187
2188 // ============================================================================
2189 // wxEncodingConverter based conversion classes
2190 // ============================================================================
2191
2192 #if wxUSE_FONTMAP
2193
2194 class wxMBConv_wxwin : public wxMBConv
2195 {
2196 private:
2197     void Init()
2198     {
2199         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2200                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2201     }
2202
2203 public:
2204     // temporarily just use wxEncodingConverter stuff,
2205     // so that it works while a better implementation is built
2206     wxMBConv_wxwin(const wxChar* name)
2207     {
2208         if (name)
2209             m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2210         else
2211             m_enc = wxFONTENCODING_SYSTEM;
2212
2213         Init();
2214     }
2215
2216     wxMBConv_wxwin(wxFontEncoding enc)
2217     {
2218         m_enc = enc;
2219
2220         Init();
2221     }
2222
2223     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2224     {
2225         size_t inbuf = strlen(psz);
2226         if (buf)
2227         {
2228             if (!m2w.Convert(psz,buf))
2229                 return (size_t)-1;
2230         }
2231         return inbuf;
2232     }
2233
2234     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2235     {
2236         const size_t inbuf = wxWcslen(psz);
2237         if (buf)
2238         {
2239             if (!w2m.Convert(psz,buf))
2240                 return (size_t)-1;
2241         }
2242
2243         return inbuf;
2244     }
2245
2246     bool IsOk() const { return m_ok; }
2247
2248 public:
2249     wxFontEncoding m_enc;
2250     wxEncodingConverter m2w, w2m;
2251
2252     // were we initialized successfully?
2253     bool m_ok;
2254
2255     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2256 };
2257
2258 #endif // wxUSE_FONTMAP
2259
2260 // ============================================================================
2261 // wxCSConv implementation
2262 // ============================================================================
2263
2264 void wxCSConv::Init()
2265 {
2266     m_name = NULL;
2267     m_convReal =  NULL;
2268     m_deferred = true;
2269 }
2270
2271 wxCSConv::wxCSConv(const wxChar *charset)
2272 {
2273     Init();
2274
2275     if ( charset )
2276     {
2277         SetName(charset);
2278     }
2279
2280     m_encoding = wxFONTENCODING_SYSTEM;
2281 }
2282
2283 wxCSConv::wxCSConv(wxFontEncoding encoding)
2284 {
2285     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2286     {
2287         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2288
2289         encoding = wxFONTENCODING_SYSTEM;
2290     }
2291
2292     Init();
2293
2294     m_encoding = encoding;
2295 }
2296
2297 wxCSConv::~wxCSConv()
2298 {
2299     Clear();
2300 }
2301
2302 wxCSConv::wxCSConv(const wxCSConv& conv)
2303         : wxMBConv()
2304 {
2305     Init();
2306
2307     SetName(conv.m_name);
2308     m_encoding = conv.m_encoding;
2309 }
2310
2311 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2312 {
2313     Clear();
2314
2315     SetName(conv.m_name);
2316     m_encoding = conv.m_encoding;
2317
2318     return *this;
2319 }
2320
2321 void wxCSConv::Clear()
2322 {
2323     free(m_name);
2324     delete m_convReal;
2325
2326     m_name = NULL;
2327     m_convReal = NULL;
2328 }
2329
2330 void wxCSConv::SetName(const wxChar *charset)
2331 {
2332     if (charset)
2333     {
2334         m_name = wxStrdup(charset);
2335         m_deferred = true;
2336     }
2337 }
2338
2339 wxMBConv *wxCSConv::DoCreate() const
2340 {
2341     // check for the special case of ASCII or ISO8859-1 charset: as we have
2342     // special knowledge of it anyhow, we don't need to create a special
2343     // conversion object
2344     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2345     {
2346         // don't convert at all
2347         return NULL;
2348     }
2349
2350     // we trust OS to do conversion better than we can so try external
2351     // conversion methods first
2352     //
2353     // the full order is:
2354     //      1. OS conversion (iconv() under Unix or Win32 API)
2355     //      2. hard coded conversions for UTF
2356     //      3. wxEncodingConverter as fall back
2357
2358     // step (1)
2359 #ifdef HAVE_ICONV
2360 #if !wxUSE_FONTMAP
2361     if ( m_name )
2362 #endif // !wxUSE_FONTMAP
2363     {
2364         wxString name(m_name);
2365
2366 #if wxUSE_FONTMAP
2367         if ( name.empty() )
2368             name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2369 #endif // wxUSE_FONTMAP
2370
2371         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2372         if ( conv->IsOk() )
2373             return conv;
2374
2375         delete conv;
2376     }
2377 #endif // HAVE_ICONV
2378
2379 #ifdef wxHAVE_WIN32_MB2WC
2380     {
2381 #if wxUSE_FONTMAP
2382         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2383                                       : new wxMBConv_win32(m_encoding);
2384         if ( conv->IsOk() )
2385             return conv;
2386
2387         delete conv;
2388 #else
2389         return NULL;
2390 #endif
2391     }
2392 #endif // wxHAVE_WIN32_MB2WC
2393 #if defined(__WXMAC__)
2394     {
2395         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2396         {
2397
2398             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2399                                         : new wxMBConv_mac(m_encoding);
2400             if ( conv->IsOk() )
2401                  return conv;
2402
2403             delete conv;
2404         }
2405     }
2406 #endif
2407 #if defined(__WXCOCOA__)
2408     {
2409         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2410         {
2411
2412             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2413                                           : new wxMBConv_cocoa(m_encoding);
2414             if ( conv->IsOk() )
2415                  return conv;
2416
2417             delete conv;
2418         }
2419     }
2420 #endif
2421     // step (2)
2422     wxFontEncoding enc = m_encoding;
2423 #if wxUSE_FONTMAP
2424     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2425     {
2426         // use "false" to suppress interactive dialogs -- we can be called from
2427         // anywhere and popping up a dialog from here is the last thing we want to
2428         // do
2429         enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2430     }
2431 #endif // wxUSE_FONTMAP
2432
2433     switch ( enc )
2434     {
2435         case wxFONTENCODING_UTF7:
2436              return new wxMBConvUTF7;
2437
2438         case wxFONTENCODING_UTF8:
2439              return new wxMBConvUTF8;
2440
2441         case wxFONTENCODING_UTF16BE:
2442              return new wxMBConvUTF16BE;
2443
2444         case wxFONTENCODING_UTF16LE:
2445              return new wxMBConvUTF16LE;
2446
2447         case wxFONTENCODING_UTF32BE:
2448              return new wxMBConvUTF32BE;
2449
2450         case wxFONTENCODING_UTF32LE:
2451              return new wxMBConvUTF32LE;
2452
2453         default:
2454              // nothing to do but put here to suppress gcc warnings
2455              ;
2456     }
2457
2458     // step (3)
2459 #if wxUSE_FONTMAP
2460     {
2461         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2462                                       : new wxMBConv_wxwin(m_encoding);
2463         if ( conv->IsOk() )
2464             return conv;
2465
2466         delete conv;
2467     }
2468 #endif // wxUSE_FONTMAP
2469
2470     // NB: This is a hack to prevent deadlock. What could otherwise happen
2471     //     in Unicode build: wxConvLocal creation ends up being here
2472     //     because of some failure and logs the error. But wxLog will try to
2473     //     attach timestamp, for which it will need wxConvLocal (to convert
2474     //     time to char* and then wchar_t*), but that fails, tries to log
2475     //     error, but wxLog has a (already locked) critical section that
2476     //     guards static buffer.
2477     static bool alreadyLoggingError = false;
2478     if (!alreadyLoggingError)
2479     {
2480         alreadyLoggingError = true;
2481         wxLogError(_("Cannot convert from the charset '%s'!"),
2482                    m_name ? m_name
2483                       :
2484 #if wxUSE_FONTMAP
2485                          wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2486 #else // !wxUSE_FONTMAP
2487                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2488 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2489               );
2490         alreadyLoggingError = false;
2491     }
2492
2493     return NULL;
2494 }
2495
2496 void wxCSConv::CreateConvIfNeeded() const
2497 {
2498     if ( m_deferred )
2499     {
2500         wxCSConv *self = (wxCSConv *)this; // const_cast
2501
2502 #if wxUSE_INTL
2503         // if we don't have neither the name nor the encoding, use the default
2504         // encoding for this system
2505         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2506         {
2507             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2508         }
2509 #endif // wxUSE_INTL
2510
2511         self->m_convReal = DoCreate();
2512         self->m_deferred = false;
2513     }
2514 }
2515
2516 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2517 {
2518     CreateConvIfNeeded();
2519
2520     if (m_convReal)
2521         return m_convReal->MB2WC(buf, psz, n);
2522
2523     // latin-1 (direct)
2524     size_t len = strlen(psz);
2525
2526     if (buf)
2527     {
2528         for (size_t c = 0; c <= len; c++)
2529             buf[c] = (unsigned char)(psz[c]);
2530     }
2531
2532     return len;
2533 }
2534
2535 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2536 {
2537     CreateConvIfNeeded();
2538
2539     if (m_convReal)
2540         return m_convReal->WC2MB(buf, psz, n);
2541
2542     // latin-1 (direct)
2543     const size_t len = wxWcslen(psz);
2544     if (buf)
2545     {
2546         for (size_t c = 0; c <= len; c++)
2547         {
2548             if (psz[c] > 0xFF)
2549                 return (size_t)-1;
2550             buf[c] = (char)psz[c];
2551         }
2552     }
2553     else
2554     {
2555         for (size_t c = 0; c <= len; c++)
2556         {
2557             if (psz[c] > 0xFF)
2558                 return (size_t)-1;
2559         }
2560     }
2561
2562     return len;
2563 }
2564
2565 // ----------------------------------------------------------------------------
2566 // globals
2567 // ----------------------------------------------------------------------------
2568
// the object behind wxConvLibc: its class depends on the platform
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;


// the exported converters are references bound to the static objects above
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
// the current converter initially points to the libc one
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2589
2590 #else // !wxUSE_WCHAR_T
2591
// stand-ins in absence of wchar_t: when wxUSE_WCHAR_T is 0 these converters
// are defined as plain wxMBConv objects
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
2597
2598 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2599
2600