src/common/strconv.cpp

   1 /////////////////////////////////////////////////////////////////////////////
   2 // Name:        strconv.cpp
   3 // Purpose:     Unicode conversion classes
   4 // Author:      Ove Kaaven, Robert Roebling, Vadim Zeitlin, Vaclav Slavik,
   5 //              Ryan Norton, Fredrik Roubert (UTF7)
   6 // Modified by:
   7 // Created:     29/01/98
   8 // RCS-ID:      $Id$
   9 // Copyright:   (c) 1999 Ove Kaaven, Robert Roebling, Vaclav Slavik
  10 //              (c) 2000-2003 Vadim Zeitlin
  11 //              (c) 2004 Ryan Norton, Fredrik Roubert
  12 // Licence:     wxWindows licence
  13 /////////////////////////////////////////////////////////////////////////////
  14
  15 // ============================================================================
  16 // declarations
  17 // ============================================================================
  18
  19 // ----------------------------------------------------------------------------
  20 // headers
  21 // ----------------------------------------------------------------------------
  22
  23 #if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
  24   #pragma implementation "strconv.h"
  25 #endif
  26
  27 // For compilers that support precompilation, includes "wx.h".
  28 #include "wx/wxprec.h"
  29
  30 #ifdef __BORLANDC__
  31   #pragma hdrstop
  32 #endif
  33
  34 #ifndef WX_PRECOMP
  35     #include "wx/intl.h"
  36     #include "wx/log.h"
  37 #endif // WX_PRECOMP
  38
  39 #include "wx/strconv.h"
  40
  41 #if wxUSE_WCHAR_T
  42
  43 #ifdef __WXMSW__
  44     #include "wx/msw/private.h"
  45 #endif
  46
  47 #ifdef __WINDOWS__
  48     #include "wx/msw/missing.h"
  49 #endif
  50
  51 #ifndef __WXWINCE__
  52 #include <errno.h>
  53 #endif
  54
  55 #include <ctype.h>
  56 #include <string.h>
  57 #include <stdlib.h>
  58
  59 #if defined(__WIN32__) && !defined(__WXMICROWIN__)
  60     #define wxHAVE_WIN32_MB2WC
  61 #endif // __WIN32__ but !__WXMICROWIN__
  62
  63 // ----------------------------------------------------------------------------
  64 // headers
  65 // ----------------------------------------------------------------------------
  66
  67 #ifdef __SALFORDC__
  68     #include <clib.h>
  69 #endif
  70
  71 #ifdef HAVE_ICONV
  72     #include <iconv.h>
  73 #endif
  74
  75 #include "wx/encconv.h"
  76 #include "wx/fontmap.h"
  77 #include "wx/utils.h"
  78
  79 #ifdef __WXMAC__
  80 #include <ATSUnicode.h>
  81 #include <TextCommon.h>
  82 #include <TextEncodingConverter.h>
  83
  84 #include  "wx/mac/private.h"  // includes mac headers
  85 #endif
  86 // ----------------------------------------------------------------------------
  87 // macros
  88 // ----------------------------------------------------------------------------
  89
  90 #define BSWAP_UCS4(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT32_SWAP_ALWAYS(str[_c]); }
  91 #define BSWAP_UTF16(str, len) { unsigned _c; for (_c=0; _c<len; _c++) str[_c]=wxUINT16_SWAP_ALWAYS(str[_c]); }
  92
  93 #if SIZEOF_WCHAR_T == 4
  94     #define WC_NAME         "UCS4"
  95     #define WC_BSWAP         BSWAP_UCS4
  96     #ifdef WORDS_BIGENDIAN
  97       #define WC_NAME_BEST  "UCS-4BE"
  98     #else
  99       #define WC_NAME_BEST  "UCS-4LE"
 100     #endif
 101 #elif SIZEOF_WCHAR_T == 2
 102     #define WC_NAME         "UTF16"
 103     #define WC_BSWAP         BSWAP_UTF16
 104     #define WC_UTF16
 105     #ifdef WORDS_BIGENDIAN
 106       #define WC_NAME_BEST  "UTF-16BE"
 107     #else
 108       #define WC_NAME_BEST  "UTF-16LE"
 109     #endif
 110 #else // sizeof(wchar_t) != 2 nor 4
 111     // does this ever happen?
 112     #error "Unknown sizeof(wchar_t): please report this to wx-dev@lists.wxwindows.org"
 113 #endif
 114
 115 // ============================================================================
 116 // implementation
 117 // ============================================================================
 118
 119 // ----------------------------------------------------------------------------
 120 // UTF-16 en/decoding to/from UCS-4
 121 // ----------------------------------------------------------------------------
 122
 123
 124 static size_t encode_utf16(wxUint32 input, wxUint16 *output)
 125 {
 126     if (input<=0xffff)
 127     {
 128         if (output)
 129             *output = (wxUint16) input;
 130         return 1;
 131     }
 132     else if (input>=0x110000)
 133     {
 134         return (size_t)-1;
 135     }
 136     else
 137     {
 138         if (output)
 139         {
 140             *output++ = (wxUint16) ((input >> 10)+0xd7c0);
 141             *output = (wxUint16) ((input&0x3ff)+0xdc00);
 142         }
 143         return 2;
 144     }
 145 }
 146
 147 static size_t decode_utf16(const wxUint16* input, wxUint32& output)
 148 {
 149     if ((*input<0xd800) || (*input>0xdfff))
 150     {
 151         output = *input;
 152         return 1;
 153     }
 154     else if ((input[1]<0xdc00) || (input[1]>=0xdfff))
 155     {
 156         output = *input;
 157         return (size_t)-1;
 158     }
 159     else
 160     {
 161         output = ((input[0] - 0xd7c0) << 10) + (input[1] - 0xdc00);
 162         return 2;
 163     }
 164 }
 165
 166
 167 // ----------------------------------------------------------------------------
 168 // wxMBConv
 169 // ----------------------------------------------------------------------------
 170
 171 wxMBConv::~wxMBConv()
 172 {
 173     // nothing to do here (necessary for Darwin linking probably)
 174 }
 175
 176 const wxWCharBuffer wxMBConv::cMB2WC(const char *psz) const
 177 {
 178     if ( psz )
 179     {
 180         // calculate the length of the buffer needed first
 181         size_t nLen = MB2WC(NULL, psz, 0);
 182         if ( nLen != (size_t)-1 )
 183         {
 184             // now do the actual conversion
 185             wxWCharBuffer buf(nLen);
 186             nLen = MB2WC(buf.data(), psz, nLen + 1); // with the trailing NULL
 187             if ( nLen != (size_t)-1 )
 188             {
 189                 return buf;
 190             }
 191         }
 192     }
 193
 194     wxWCharBuffer buf((wchar_t *)NULL);
 195
 196     return buf;
 197 }
 198
 199 const wxCharBuffer wxMBConv::cWC2MB(const wchar_t *pwz) const
 200 {
 201     if ( pwz )
 202     {
 203         size_t nLen = WC2MB(NULL, pwz, 0);
 204         if ( nLen != (size_t)-1 )
 205         {
 206             wxCharBuffer buf(nLen+3);       // space for a wxUint32 trailing zero
 207             nLen = WC2MB(buf.data(), pwz, nLen + 4);
 208             if ( nLen != (size_t)-1 )
 209             {
 210                 return buf;
 211             }
 212         }
 213     }
 214
 215     wxCharBuffer buf((char *)NULL);
 216
 217     return buf;
 218 }
 219
 220 // ----------------------------------------------------------------------------
 221 // wxMBConvLibc
 222 // ----------------------------------------------------------------------------
 223
 224 size_t wxMBConvLibc::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 225 {
 226     return wxMB2WC(buf, psz, n);
 227 }
 228
 229 size_t wxMBConvLibc::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 230 {
 231     return wxWC2MB(buf, psz, n);
 232 }
 233 // ----------------------------------------------------------------------------
 234 // UTF-7
 235 // ----------------------------------------------------------------------------
 236
 237 // Implementation (C) 2004 Fredrik Roubert
 238
 239 //
 240 // BASE64 decoding table
 241 //
 242 static const unsigned char utf7unb64[] =
 243 {
 244     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 245     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 246     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 247     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 248     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 249     0xff, 0xff, 0xff, 0x3e, 0xff, 0xff, 0xff, 0x3f,
 250     0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
 251     0x3c, 0x3d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 252     0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
 253     0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
 254     0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
 255     0x17, 0x18, 0x19, 0xff, 0xff, 0xff, 0xff, 0xff,
 256     0xff, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
 257     0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
 258     0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30,
 259     0x31, 0x32, 0x33, 0xff, 0xff, 0xff, 0xff, 0xff,
 260     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 261     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 262     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 263     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 264     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 265     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 266     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 267     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 268     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 269     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 270     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 271     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 272     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 273     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 274     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
 275     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 276 };
 277
 278 size_t wxMBConvUTF7::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 279 {
 280
 281     size_t len = 0;
 282
 283     while (*psz && ((!buf) || (len < n)))
 284     {
 285         unsigned char cc = *psz++;
 286         if (cc != '+')
 287         {
 288             // plain ASCII char
 289             if (buf)
 290                 *buf++ = cc;
 291             len++;
 292         }
 293         else if (*psz == '-')
 294         {
 295             // encoded plus sign
 296             if (buf)
 297                 *buf++ = cc;
 298             len++;
 299             psz++;
 300         }
 301         else
 302         {
 303             // BASE64 encoded string
 304             bool lsb;
 305             unsigned char c;
 306             unsigned int d, l;
 307             for (lsb = false, d = 0, l = 0;
 308                 (cc = utf7unb64[(unsigned char)*psz]) != 0xff; psz++)
 309             {
 310                 d <<= 6;
 311                 d += cc;
 312                 for (l += 6; l >= 8; lsb = !lsb)
 313                 {
 314                     c = (d >> (l -= 8)) % 256;
 315                     if (lsb)
 316                     {
 317                         if (buf)
 318                             *buf++ |= c;
 319                         len ++;
 320                     }
 321                     else
 322                         if (buf)
 323                             *buf = c << 8;
 324                 }
 325             }
 326             if (*psz == '-')
 327                 psz++;
 328         }
 329     }
 330     if (buf && (len < n))
 331         *buf = 0;
 332     return len;
 333 }
 334
 335 //
 336 // BASE64 encoding table
 337 //
 338 static const unsigned char utf7enb64[] =
 339 {
 340     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
 341     'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
 342     'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
 343     'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
 344     'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
 345     'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
 346     'w', 'x', 'y', 'z', '0', '1', '2', '3',
 347     '4', '5', '6', '7', '8', '9', '+', '/'
 348 };
 349
 350 //
 351 // UTF-7 encoding table
 352 //
 353 // 0 - Set D (directly encoded characters)
 354 // 1 - Set O (optional direct characters)
 355 // 2 - whitespace characters (optional)
 356 // 3 - special characters
 357 //
 358 static const unsigned char utf7encode[128] =
 359 {
 360     3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
 361     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 362     2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 3,
 363     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
 364     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 365     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
 366     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 367     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3
 368 };
 369
 370 size_t wxMBConvUTF7::WC2MB(char *buf, const wchar_t
 371 *psz, size_t n) const
 372 {
 373
 374
 375     size_t len = 0;
 376
 377     while (*psz && ((!buf) || (len < n)))
 378     {
 379         wchar_t cc = *psz++;
 380         if (cc < 0x80 && utf7encode[cc] < 1)
 381         {
 382             // plain ASCII char
 383             if (buf)
 384                 *buf++ = (char)cc;
 385             len++;
 386         }
 387 #ifndef WC_UTF16
 388         else if (cc > 0xffff)
 389         {
 390             // no surrogate pair generation (yet?)
 391             return (size_t)-1;
 392         }
 393 #endif
 394         else
 395         {
 396             if (buf)
 397                 *buf++ = '+';
 398             len++;
 399             if (cc != '+')
 400             {
 401                 // BASE64 encode string
 402                 unsigned int lsb, d, l;
 403                 for (d = 0, l = 0;; psz++)
 404                 {
 405                     for (lsb = 0; lsb < 2; lsb ++)
 406                     {
 407                         d <<= 8;
 408                         d += lsb ? cc & 0xff : (cc & 0xff00) >> 8;
 409
 410                         for (l += 8; l >= 6; )
 411                         {
 412                             l -= 6;
 413                             if (buf)
 414                                 *buf++ = utf7enb64[(d >> l) % 64];
 415                             len++;
 416                         }
 417                     }
 418                     cc = *psz;
 419                     if (!(cc) || (cc < 0x80 && utf7encode[cc] < 1))
 420                         break;
 421                 }
 422                 if (l != 0)
 423                 {
 424                     if (buf)
 425                         *buf++ = utf7enb64[((d % 16) << (6 - l)) % 64];
 426                     len++;
 427                 }
 428             }
 429             if (buf)
 430                 *buf++ = '-';
 431             len++;
 432         }
 433     }
 434     if (buf && (len < n))
 435         *buf = 0;
 436     return len;
 437 }
 438
 439 // ----------------------------------------------------------------------------
 440 // UTF-8
 441 // ----------------------------------------------------------------------------
 442
 443 static wxUint32 utf8_max[]=
 444     { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff, 0xffffffff };
 445
 446 size_t wxMBConvUTF8::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 447 {
 448     size_t len = 0;
 449
 450     while (*psz && ((!buf) || (len < n)))
 451     {
 452         unsigned char cc = *psz++, fc = cc;
 453         unsigned cnt;
 454         for (cnt = 0; fc & 0x80; cnt++)
 455             fc <<= 1;
 456         if (!cnt)
 457         {
 458             // plain ASCII char
 459             if (buf)
 460                 *buf++ = cc;
 461             len++;
 462         }
 463         else
 464         {
 465             cnt--;
 466             if (!cnt)
 467             {
 468                 // invalid UTF-8 sequence
 469                 return (size_t)-1;
 470             }
 471             else
 472             {
 473                 unsigned ocnt = cnt - 1;
 474                 wxUint32 res = cc & (0x3f >> cnt);
 475                 while (cnt--)
 476                 {
 477                     cc = *psz++;
 478                     if ((cc & 0xC0) != 0x80)
 479                     {
 480                         // invalid UTF-8 sequence
 481                         return (size_t)-1;
 482                     }
 483                     res = (res << 6) | (cc & 0x3f);
 484                 }
 485                 if (res <= utf8_max[ocnt])
 486                 {
 487                     // illegal UTF-8 encoding
 488                     return (size_t)-1;
 489                 }
 490 #ifdef WC_UTF16
 491                 // cast is ok because wchar_t == wxUuint16 if WC_UTF16
 492                 size_t pa = encode_utf16(res, (wxUint16 *)buf);
 493                 if (pa == (size_t)-1)
 494                   return (size_t)-1;
 495                 if (buf)
 496                     buf += pa;
 497                 len += pa;
 498 #else // !WC_UTF16
 499                 if (buf)
 500                     *buf++ = res;
 501                 len++;
 502 #endif // WC_UTF16/!WC_UTF16
 503             }
 504         }
 505     }
 506     if (buf && (len < n))
 507         *buf = 0;
 508     return len;
 509 }
 510
 511 size_t wxMBConvUTF8::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 512 {
 513     size_t len = 0;
 514
 515     while (*psz && ((!buf) || (len < n)))
 516     {
 517         wxUint32 cc;
 518 #ifdef WC_UTF16
 519         // cast is ok for WC_UTF16
 520         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 521         psz += (pa == (size_t)-1) ? 1 : pa;
 522 #else
 523         cc=(*psz++) & 0x7fffffff;
 524 #endif
 525         unsigned cnt;
 526         for (cnt = 0; cc > utf8_max[cnt]; cnt++) {}
 527         if (!cnt)
 528         {
 529             // plain ASCII char
 530             if (buf)
 531                 *buf++ = (char) cc;
 532             len++;
 533         }
 534
 535         else
 536         {
 537             len += cnt + 1;
 538             if (buf)
 539             {
 540                 *buf++ = (char) ((-128 >> cnt) | ((cc >> (cnt * 6)) & (0x3f >> cnt)));
 541                 while (cnt--)
 542                     *buf++ = (char) (0x80 | ((cc >> (cnt * 6)) & 0x3f));
 543             }
 544         }
 545     }
 546
 547     if (buf && (len<n)) *buf = 0;
 548
 549     return len;
 550 }
 551
 552
 553
 554
 555 // ----------------------------------------------------------------------------
 556 // UTF-16
 557 // ----------------------------------------------------------------------------
 558
 559 #ifdef WORDS_BIGENDIAN
 560     #define wxMBConvUTF16straight wxMBConvUTF16BE
 561     #define wxMBConvUTF16swap     wxMBConvUTF16LE
 562 #else
 563     #define wxMBConvUTF16swap     wxMBConvUTF16BE
 564     #define wxMBConvUTF16straight wxMBConvUTF16LE
 565 #endif
 566
 567
 568 #ifdef WC_UTF16
 569
 570 // copy 16bit MB to 16bit String
 571 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 572 {
 573     size_t len=0;
 574
 575     while (*(wxUint16*)psz && (!buf || len < n))
 576     {
 577         if (buf)
 578             *buf++ = *(wxUint16*)psz;
 579         len++;
 580
 581         psz += sizeof(wxUint16);
 582     }
 583     if (buf && len<n)   *buf=0;
 584
 585     return len;
 586 }
 587
 588
 589 // copy 16bit String to 16bit MB
 590 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 591 {
 592     size_t len=0;
 593
 594     while (*psz && (!buf || len < n))
 595     {
 596         if (buf)
 597         {
 598             *(wxUint16*)buf = *psz;
 599             buf += sizeof(wxUint16);
 600         }
 601         len += sizeof(wxUint16);
 602         psz++;
 603     }
 604     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 605
 606     return len;
 607 }
 608
 609
 610 // swap 16bit MB to 16bit String
 611 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 612 {
 613     size_t len=0;
 614
 615     while (*(wxUint16*)psz && (!buf || len < n))
 616     {
 617         if (buf)
 618         {
 619             ((char *)buf)[0] = psz[1];
 620             ((char *)buf)[1] = psz[0];
 621             buf++;
 622         }
 623         len++;
 624         psz += sizeof(wxUint16);
 625     }
 626     if (buf && len<n)   *buf=0;
 627
 628     return len;
 629 }
 630
 631
 632 // swap 16bit MB to 16bit String
 633 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 634 {
 635     size_t len=0;
 636
 637     while (*psz && (!buf || len < n))
 638     {
 639         if (buf)
 640         {
 641             *buf++ = ((char*)psz)[1];
 642             *buf++ = ((char*)psz)[0];
 643         }
 644         len += sizeof(wxUint16);
 645         psz++;
 646     }
 647     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 648
 649     return len;
 650 }
 651
 652
 653 #else // WC_UTF16
 654
 655
 656 // copy 16bit MB to 32bit String
 657 size_t wxMBConvUTF16straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 658 {
 659     size_t len=0;
 660
 661     while (*(wxUint16*)psz && (!buf || len < n))
 662     {
 663         wxUint32 cc;
 664         size_t pa=decode_utf16((wxUint16*)psz, cc);
 665         if (pa == (size_t)-1)
 666             return pa;
 667
 668         if (buf)
 669             *buf++ = cc;
 670         len++;
 671         psz += pa * sizeof(wxUint16);
 672     }
 673     if (buf && len<n)   *buf=0;
 674
 675     return len;
 676 }
 677
 678
 679 // copy 32bit String to 16bit MB
 680 size_t wxMBConvUTF16straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 681 {
 682     size_t len=0;
 683
 684     while (*psz && (!buf || len < n))
 685     {
 686         wxUint16 cc[2];
 687         size_t pa=encode_utf16(*psz, cc);
 688
 689         if (pa == (size_t)-1)
 690             return pa;
 691
 692         if (buf)
 693         {
 694             *(wxUint16*)buf = cc[0];
 695             buf += sizeof(wxUint16);
 696             if (pa > 1)
 697             {
 698                 *(wxUint16*)buf = cc[1];
 699                 buf += sizeof(wxUint16);
 700             }
 701         }
 702
 703         len += pa*sizeof(wxUint16);
 704         psz++;
 705     }
 706     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 707
 708     return len;
 709 }
 710
 711
 712 // swap 16bit MB to 32bit String
 713 size_t wxMBConvUTF16swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 714 {
 715     size_t len=0;
 716
 717     while (*(wxUint16*)psz && (!buf || len < n))
 718     {
 719         wxUint32 cc;
 720         char tmp[4];
 721         tmp[0]=psz[1];  tmp[1]=psz[0];
 722         tmp[2]=psz[3];  tmp[3]=psz[2];
 723
 724         size_t pa=decode_utf16((wxUint16*)tmp, cc);
 725         if (pa == (size_t)-1)
 726             return pa;
 727
 728         if (buf)
 729             *buf++ = cc;
 730
 731         len++;
 732         psz += pa * sizeof(wxUint16);
 733     }
 734     if (buf && len<n)   *buf=0;
 735
 736     return len;
 737 }
 738
 739
 740 // swap 32bit String to 16bit MB
 741 size_t wxMBConvUTF16swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 742 {
 743     size_t len=0;
 744
 745     while (*psz && (!buf || len < n))
 746     {
 747         wxUint16 cc[2];
 748         size_t pa=encode_utf16(*psz, cc);
 749
 750         if (pa == (size_t)-1)
 751             return pa;
 752
 753         if (buf)
 754         {
 755             *buf++ = ((char*)cc)[1];
 756             *buf++ = ((char*)cc)[0];
 757             if (pa > 1)
 758             {
 759                 *buf++ = ((char*)cc)[3];
 760                 *buf++ = ((char*)cc)[2];
 761             }
 762         }
 763
 764         len += pa*sizeof(wxUint16);
 765         psz++;
 766     }
 767     if (buf && len<=n-sizeof(wxUint16))   *(wxUint16*)buf=0;
 768
 769     return len;
 770 }
 771
 772 #endif // WC_UTF16
 773
 774
 775 // ----------------------------------------------------------------------------
 776 // UTF-32
 777 // ----------------------------------------------------------------------------
 778
 779 #ifdef WORDS_BIGENDIAN
 780 #define wxMBConvUTF32straight  wxMBConvUTF32BE
 781 #define wxMBConvUTF32swap      wxMBConvUTF32LE
 782 #else
 783 #define wxMBConvUTF32swap      wxMBConvUTF32BE
 784 #define wxMBConvUTF32straight  wxMBConvUTF32LE
 785 #endif
 786
 787
// definitions of the two global UTF-32 converter objects, one of each of the
// wxMBConvUTF32LE and wxMBConvUTF32BE classes
  788 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32LE) wxConvUTF32LE;
  789 WXDLLIMPEXP_DATA_BASE(wxMBConvUTF32BE) wxConvUTF32BE;
 790
 791
 792 #ifdef WC_UTF16
 793
 794 // copy 32bit MB to 16bit String
 795 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 796 {
 797     size_t len=0;
 798
 799     while (*(wxUint32*)psz && (!buf || len < n))
 800     {
 801         wxUint16 cc[2];
 802
 803         size_t pa=encode_utf16(*(wxUint32*)psz, cc);
 804         if (pa == (size_t)-1)
 805             return pa;
 806
 807         if (buf)
 808         {
 809             *buf++ = cc[0];
 810             if (pa > 1)
 811                 *buf++ = cc[1];
 812         }
 813         len += pa;
 814         psz += sizeof(wxUint32);
 815     }
 816     if (buf && len<n)   *buf=0;
 817
 818     return len;
 819 }
 820
 821
 822 // copy 16bit String to 32bit MB
 823 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 824 {
 825     size_t len=0;
 826
 827     while (*psz && (!buf || len < n))
 828     {
 829         wxUint32 cc;
 830
 831         // cast is ok for WC_UTF16
 832         size_t pa = decode_utf16((const wxUint16 *)psz, cc);
 833         if (pa == (size_t)-1)
 834             return pa;
 835
 836         if (buf)
 837         {
 838             *(wxUint32*)buf = cc;
 839             buf += sizeof(wxUint32);
 840         }
 841         len += sizeof(wxUint32);
 842         psz += pa;
 843     }
 844
 845     if (buf && len<=n-sizeof(wxUint32))
 846         *(wxUint32*)buf=0;
 847
 848     return len;
 849 }
 850
 851
 852
 853 // swap 32bit MB to 16bit String
 854 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 855 {
 856     size_t len=0;
 857
 858     while (*(wxUint32*)psz && (!buf || len < n))
 859     {
 860         char tmp[4];
 861         tmp[0] = psz[3];   tmp[1] = psz[2];
 862         tmp[2] = psz[1];   tmp[3] = psz[0];
 863
 864
 865         wxUint16 cc[2];
 866
 867         size_t pa=encode_utf16(*(wxUint32*)tmp, cc);
 868         if (pa == (size_t)-1)
 869             return pa;
 870
 871         if (buf)
 872         {
 873             *buf++ = cc[0];
 874             if (pa > 1)
 875                 *buf++ = cc[1];
 876         }
 877         len += pa;
 878         psz += sizeof(wxUint32);
 879     }
 880
 881     if (buf && len<n)
 882         *buf=0;
 883
 884     return len;
 885 }
 886
 887
 888 // swap 16bit String to 32bit MB
 889 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 890 {
 891     size_t len=0;
 892
 893     while (*psz && (!buf || len < n))
 894     {
 895         char cc[4];
 896
 897         // cast is ok for WC_UTF16
 898         size_t pa=decode_utf16((const wxUint16 *)psz, *(wxUint32*)cc);
 899         if (pa == (size_t)-1)
 900             return pa;
 901
 902         if (buf)
 903         {
 904             *buf++ = cc[3];
 905             *buf++ = cc[2];
 906             *buf++ = cc[1];
 907             *buf++ = cc[0];
 908         }
 909         len += sizeof(wxUint32);
 910         psz += pa;
 911     }
 912
 913     if (buf && len<=n-sizeof(wxUint32))
 914         *(wxUint32*)buf=0;
 915
 916     return len;
 917 }
 918
 919 #else // WC_UTF16
 920
 921
 922 // copy 32bit MB to 32bit String
 923 size_t wxMBConvUTF32straight::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 924 {
 925     size_t len=0;
 926
 927     while (*(wxUint32*)psz && (!buf || len < n))
 928     {
 929         if (buf)
 930             *buf++ = *(wxUint32*)psz;
 931         len++;
 932         psz += sizeof(wxUint32);
 933     }
 934
 935     if (buf && len<n)
 936         *buf=0;
 937
 938     return len;
 939 }
 940
 941
 942 // copy 32bit String to 32bit MB
 943 size_t wxMBConvUTF32straight::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 944 {
 945     size_t len=0;
 946
 947     while (*psz && (!buf || len < n))
 948     {
 949         if (buf)
 950         {
 951             *(wxUint32*)buf = *psz;
 952             buf += sizeof(wxUint32);
 953         }
 954
 955         len += sizeof(wxUint32);
 956         psz++;
 957     }
 958
 959     if (buf && len<=n-sizeof(wxUint32))
 960         *(wxUint32*)buf=0;
 961
 962     return len;
 963 }
 964
 965
 966 // swap 32bit MB to 32bit String
 967 size_t wxMBConvUTF32swap::MB2WC(wchar_t *buf, const char *psz, size_t n) const
 968 {
 969     size_t len=0;
 970
 971     while (*(wxUint32*)psz && (!buf || len < n))
 972     {
 973         if (buf)
 974         {
 975             ((char *)buf)[0] = psz[3];
 976             ((char *)buf)[1] = psz[2];
 977             ((char *)buf)[2] = psz[1];
 978             ((char *)buf)[3] = psz[0];
 979             buf++;
 980         }
 981         len++;
 982         psz += sizeof(wxUint32);
 983     }
 984
 985     if (buf && len<n)
 986         *buf=0;
 987
 988     return len;
 989 }
 990
 991
 992 // swap 32bit String to 32bit MB
 993 size_t wxMBConvUTF32swap::WC2MB(char *buf, const wchar_t *psz, size_t n) const
 994 {
 995     size_t len=0;
 996
 997     while (*psz && (!buf || len < n))
 998     {
 999         if (buf)
1000         {
1001             *buf++ = ((char *)psz)[3];
1002             *buf++ = ((char *)psz)[2];
1003             *buf++ = ((char *)psz)[1];
1004             *buf++ = ((char *)psz)[0];
1005         }
1006         len += sizeof(wxUint32);
1007         psz++;
1008     }
1009
1010     if (buf && len<=n-sizeof(wxUint32))
1011         *(wxUint32*)buf=0;
1012
1013     return len;
1014 }
1015
1016
1017 #endif // WC_UTF16
1018
1019
1020 // ============================================================================
1021 // The classes doing conversion using the iconv_xxx() functions
1022 // ============================================================================
1023
1024 #ifdef HAVE_ICONV
1025
1026 // VS: glibc 2.1.3 is broken in that iconv() conversion to/from UCS4 fails with E2BIG
1027 //     if output buffer is _exactly_ as big as needed. Such case is (unless there's
1028 //     yet another bug in glibc) the only case when iconv() returns with (size_t)-1
1029 //     (which means error) and says there are 0 bytes left in the input buffer --
1030 //     when _real_ error occurs, bytes-left-in-input buffer is non-zero. Hence,
1031 //     this alternative test for iconv() failure.
1032 //     [This bug does not appear in glibc 2.2.]
1033 #if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ <= 1
1034 #define ICONV_FAILED(cres, bufLeft) ((cres == (size_t)-1) && \
1035                                      (errno != E2BIG || bufLeft != 0))
1036 #else
1037 #define ICONV_FAILED(cres, bufLeft)  (cres == (size_t)-1)
1038 #endif
1039
1040 #define ICONV_CHAR_CAST(x)  ((ICONV_CONST char **)(x))
1041
1042 // ----------------------------------------------------------------------------
1043 // wxMBConv_iconv: encapsulates an iconv character set
1044 // ----------------------------------------------------------------------------
1045
1046 class wxMBConv_iconv : public wxMBConv
1047 {
1048 public:
1049     wxMBConv_iconv(const wxChar *name);
1050     virtual ~wxMBConv_iconv();
1051
1052     virtual size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const;
1053     virtual size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const;
1054
1055     bool IsOk() const
1056         { return (m2w != (iconv_t)-1) && (w2m != (iconv_t)-1); }
1057
1058 protected:
1059     // the iconv handlers used to translate from multibyte to wide char and in
1060     // the other direction
1061     iconv_t m2w,
1062             w2m;
1063
1064 private:
1065     // the name (for iconv_open()) of a wide char charset -- if none is
1066     // available on this machine, it will remain NULL
1067     static const char *ms_wcCharsetName;
1068
1069     // true if the wide char encoding we use (i.e. ms_wcCharsetName) has
1070     // different endian-ness than the native one
1071     static bool ms_wcNeedsSwap;
1072 };
1073
1074 const char *wxMBConv_iconv::ms_wcCharsetName = NULL;
1075 bool wxMBConv_iconv::ms_wcNeedsSwap = false;
1076
1077 wxMBConv_iconv::wxMBConv_iconv(const wxChar *name)
1078 {
1079     // Do it the hard way
1080     char cname[100];
1081     for (size_t i = 0; i < wxStrlen(name)+1; i++)
1082         cname[i] = (char) name[i];
1083
1084     // check for charset that represents wchar_t:
1085     if (ms_wcCharsetName == NULL)
1086     {
1087         ms_wcNeedsSwap = false;
1088
1089         // try charset with explicit bytesex info (e.g. "UCS-4LE"):
1090         ms_wcCharsetName = WC_NAME_BEST;
1091         m2w = iconv_open(ms_wcCharsetName, cname);
1092
1093         if (m2w == (iconv_t)-1)
1094         {
1095             // try charset w/o bytesex info (e.g. "UCS4")
1096             // and check for bytesex ourselves:
1097             ms_wcCharsetName = WC_NAME;
1098             m2w = iconv_open(ms_wcCharsetName, cname);
1099
1100             // last bet, try if it knows WCHAR_T pseudo-charset
1101             if (m2w == (iconv_t)-1)
1102             {
1103                 ms_wcCharsetName = "WCHAR_T";
1104                 m2w = iconv_open(ms_wcCharsetName, cname);
1105             }
1106
1107             if (m2w != (iconv_t)-1)
1108             {
1109                 char    buf[2], *bufPtr;
1110                 wchar_t wbuf[2], *wbufPtr;
1111                 size_t  insz, outsz;
1112                 size_t  res;
1113
1114                 buf[0] = 'A';
1115                 buf[1] = 0;
1116                 wbuf[0] = 0;
1117                 insz = 2;
1118                 outsz = SIZEOF_WCHAR_T * 2;
1119                 wbufPtr = wbuf;
1120                 bufPtr = buf;
1121
1122                 res = iconv(m2w, ICONV_CHAR_CAST(&bufPtr), &insz,
1123                             (char**)&wbufPtr, &outsz);
1124
1125                 if (ICONV_FAILED(res, insz))
1126                 {
1127                     ms_wcCharsetName = NULL;
1128                     wxLogLastError(wxT("iconv"));
1129                     wxLogError(_("Conversion to charset '%s' doesn't work."), name);
1130                 }
1131                 else
1132                 {
1133                     ms_wcNeedsSwap = wbuf[0] != (wchar_t)buf[0];
1134                 }
1135             }
1136             else
1137             {
1138                 ms_wcCharsetName = NULL;
1139
1140                 // VS: we must not output an error here, since wxWidgets will safely
1141                 //     fall back to using wxEncodingConverter.
1142                 wxLogTrace(wxT("strconv"), wxT("Impossible to convert to/from charset '%s' with iconv, falling back to wxEncodingConverter."), name);
1143                 //wxLogError(
1144             }
1145         }
1146         wxLogTrace(wxT("strconv"), wxT("wchar_t charset is '%s', needs swap: %i"), ms_wcCharsetName, ms_wcNeedsSwap);
1147     }
1148     else // we already have ms_wcCharsetName
1149     {
1150         m2w = iconv_open(ms_wcCharsetName, cname);
1151     }
1152
1153     // NB: don't ever pass NULL to iconv_open(), it may crash!
1154     if ( ms_wcCharsetName )
1155     {
1156         w2m = iconv_open( cname, ms_wcCharsetName);
1157     }
1158     else
1159     {
1160         w2m = (iconv_t)-1;
1161     }
1162 }
1163
1164 wxMBConv_iconv::~wxMBConv_iconv()
1165 {
1166     if ( m2w != (iconv_t)-1 )
1167         iconv_close(m2w);
1168     if ( w2m != (iconv_t)-1 )
1169         iconv_close(w2m);
1170 }
1171
1172 size_t wxMBConv_iconv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
1173 {
1174     size_t inbuf = strlen(psz);
1175     size_t outbuf = n * SIZEOF_WCHAR_T;
1176     size_t res, cres;
1177     // VS: Use these instead of psz, buf because iconv() modifies its arguments:
1178     wchar_t *bufPtr = buf;
1179     const char *pszPtr = psz;
1180
1181     if (buf)
1182     {
1183         // have destination buffer, convert there
1184         cres = iconv(m2w,
1185                      ICONV_CHAR_CAST(&pszPtr), &inbuf,
1186                      (char**)&bufPtr, &outbuf);
1187         res = n - (outbuf / SIZEOF_WCHAR_T);
1188
1189         if (ms_wcNeedsSwap)
1190         {
1191             // convert to native endianness
1192             WC_BSWAP(buf /* _not_ bufPtr */, res)
1193         }
1194
1195         // NB: iconv was given only strlen(psz) characters on input, and so
1196         //     it couldn't convert the trailing zero. Let's do it ourselves
1197         //     if there's some room left for it in the output buffer.
1198         if (res < n)
1199             buf[res] = 0;
1200     }
1201     else
1202     {
1203         // no destination buffer... convert using temp buffer
1204         // to calculate destination buffer requirement
1205         wchar_t tbuf[8];
1206         res = 0;
1207         do {
1208             bufPtr = tbuf;
1209             outbuf = 8*SIZEOF_WCHAR_T;
1210
1211             cres = iconv(m2w,
1212                          ICONV_CHAR_CAST(&pszPtr), &inbuf,
1213                          (char**)&bufPtr, &outbuf );
1214
1215             res += 8-(outbuf/SIZEOF_WCHAR_T);
1216         } while ((cres==(size_t)-1) && (errno==E2BIG));
1217     }
1218
1219     if (ICONV_FAILED(cres, inbuf))
1220     {
1221         //VS: it is ok if iconv fails, hence trace only
1222         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1223         return (size_t)-1;
1224     }
1225
1226     return res;
1227 }
1228
1229 size_t wxMBConv_iconv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
1230 {
1231     size_t inbuf = wxWcslen(psz) * SIZEOF_WCHAR_T;
1232     size_t outbuf = n;
1233     size_t res, cres;
1234
1235     wchar_t *tmpbuf = 0;
1236
1237     if (ms_wcNeedsSwap)
1238     {
1239         // need to copy to temp buffer to switch endianness
1240         // this absolutely doesn't rock!
1241         // (no, doing WC_BSWAP twice on the original buffer won't help, as it
1242         //  could be in read-only memory, or be accessed in some other thread)
1243         tmpbuf=(wchar_t*)malloc((inbuf+1)*SIZEOF_WCHAR_T);
1244         memcpy(tmpbuf,psz,(inbuf+1)*SIZEOF_WCHAR_T);
1245         WC_BSWAP(tmpbuf, inbuf)
1246         psz=tmpbuf;
1247     }
1248
1249     if (buf)
1250     {
1251         // have destination buffer, convert there
1252         cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1253
1254         res = n-outbuf;
1255
1256         // NB: iconv was given only wcslen(psz) characters on input, and so
1257         //     it couldn't convert the trailing zero. Let's do it ourselves
1258         //     if there's some room left for it in the output buffer.
1259         if (res < n)
1260             buf[0] = 0;
1261     }
1262     else
1263     {
1264         // no destination buffer... convert using temp buffer
1265         // to calculate destination buffer requirement
1266         char tbuf[16];
1267         res = 0;
1268         do {
1269             buf = tbuf; outbuf = 16;
1270
1271             cres = iconv( w2m, ICONV_CHAR_CAST(&psz), &inbuf, &buf, &outbuf );
1272
1273             res += 16 - outbuf;
1274         } while ((cres==(size_t)-1) && (errno==E2BIG));
1275     }
1276
1277     if (ms_wcNeedsSwap)
1278     {
1279         free(tmpbuf);
1280     }
1281
1282     if (ICONV_FAILED(cres, inbuf))
1283     {
1284         //VS: it is ok if iconv fails, hence trace only
1285         wxLogTrace(wxT("strconv"), wxT("iconv failed: %s"), wxSysErrorMsg(wxSysErrorCode()));
1286         return (size_t)-1;
1287     }
1288
1289     return res;
1290 }
1291
1292 #endif // HAVE_ICONV
1293
1294
1295 // ============================================================================
1296 // Win32 conversion classes
1297 // ============================================================================
1298
1299 #ifdef wxHAVE_WIN32_MB2WC
1300
1301 // from utils.cpp
1302 #if wxUSE_FONTMAP
1303 extern WXDLLIMPEXP_BASE long wxCharsetToCodepage(const wxChar *charset);
1304 extern WXDLLIMPEXP_BASE long wxEncodingToCodepage(wxFontEncoding encoding);
1305 #endif
1306
1307 class wxMBConv_win32 : public wxMBConv
1308 {
1309 public:
1310     wxMBConv_win32()
1311     {
1312         m_CodePage = CP_ACP;
1313     }
1314
1315 #if wxUSE_FONTMAP
1316     wxMBConv_win32(const wxChar* name)
1317     {
1318         m_CodePage = wxCharsetToCodepage(name);
1319     }
1320
1321     wxMBConv_win32(wxFontEncoding encoding)
1322     {
1323         m_CodePage = wxEncodingToCodepage(encoding);
1324     }
1325 #endif
1326
1327     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
1328     {
1329         // note that we have to use MB_ERR_INVALID_CHARS flag as it without it
1330         // the behaviour is not compatible with the Unix version (using iconv)
1331         // and break the library itself, e.g. wxTextInputStream::NextChar()
1332         // wouldn't work if reading an incomplete MB char didn't result in an
1333         // error
1334         const size_t len = ::MultiByteToWideChar
1335                              (
1336                                 m_CodePage,     // code page
1337                                 MB_ERR_INVALID_CHARS, // flags: fall on error
1338                                 psz,            // input string
1339                                 -1,             // its length (NUL-terminated)
1340                                 buf,            // output string
1341                                 buf ? n : 0     // size of output buffer
1342                              );
1343
1344         // note that it returns count of written chars for buf != NULL and size
1345         // of the needed buffer for buf == NULL so in either case the length of
1346         // the string (which never includes the terminating NUL) is one less
1347         return len ? len - 1 : (size_t)-1;
1348     }
1349
1350     size_t WC2MB(char *buf, const wchar_t *pwz, size_t n) const
1351     {
1352         /*
1353             we have a problem here: by default, WideCharToMultiByte() may
1354             replace characters unrepresentable in the target code page with bad
1355             quality approximations such as turning "1/2" symbol (U+00BD) into
1356             "1" for the code pages which don't have it and we, obviously, want
1357             to avoid this at any price
1358
1359             the trouble is that this function does it _silently_, i.e. it won't
1360             even tell us whether it did or not... Win98/2000 and higher provide
1361             WC_NO_BEST_FIT_CHARS but it doesn't work for the older systems and
1362             we have to resort to a round trip, i.e. check that converting back
1363             results in the same string -- this is, of course, expensive but
1364             otherwise we simply can't be sure to not garble the data.
1365          */
1366
1367         // determine if we can rely on WC_NO_BEST_FIT_CHARS: according to MSDN
1368         // it doesn't work with CJK encodings (which we test for rather roughly
1369         // here...) nor with UTF-7/8 nor, of course, with Windows versions not
1370         // supporting it
1371         BOOL usedDef wxDUMMY_INITIALIZE(false);
1372         BOOL *pUsedDef;
1373         int flags;
1374         if ( CanUseNoBestFit() && m_CodePage < 50000 )
1375         {
1376             // it's our lucky day
1377             flags = WC_NO_BEST_FIT_CHARS;
1378             pUsedDef = &usedDef;
1379         }
1380         else // old system or unsupported encoding
1381         {
1382             flags = 0;
1383             pUsedDef = NULL;
1384         }
1385
1386         const size_t len = ::WideCharToMultiByte
1387                              (
1388                                 m_CodePage,     // code page
1389                                 flags,          // either none or no best fit
1390                                 pwz,            // input string
1391                                 -1,             // it is (wide) NUL-terminated
1392                                 buf,            // output buffer
1393                                 buf ? n : 0,    // and its size
1394                                 NULL,           // default "replacement" char
1395                                 pUsedDef        // [out] was it used?
1396                              );
1397
1398         if ( !len )
1399         {
1400             // function totally failed
1401             return (size_t)-1;
1402         }
1403
1404         // if we were really converting, check if we succeeded
1405         if ( buf )
1406         {
1407             if ( flags )
1408             {
1409                 // check if the conversion failed, i.e. if any replacements
1410                 // were done
1411                 if ( usedDef )
1412                     return (size_t)-1;
1413             }
1414             else // we must resort to double tripping...
1415             {
1416                 wxWCharBuffer wcBuf(n);
1417                 if ( MB2WC(wcBuf.data(), buf, n) == (size_t)-1 ||
1418                         wcscmp(wcBuf, pwz) != 0 )
1419                 {
1420                     // we didn't obtain the same thing we started from, hence
1421                     // the conversion was lossy and we consider that it failed
1422                     return (size_t)-1;
1423                 }
1424             }
1425         }
1426
1427         // see the comment above for the reason of "len - 1"
1428         return len - 1;
1429     }
1430
1431     bool IsOk() const { return m_CodePage != -1; }
1432
1433 private:
1434     static bool CanUseNoBestFit()
1435     {
1436         static int s_isWin98Or2k = -1;
1437
1438         if ( s_isWin98Or2k == -1 )
1439         {
1440             int verMaj, verMin;
1441             switch ( wxGetOsVersion(&verMaj, &verMin) )
1442             {
1443                 case wxWIN95:
1444                     s_isWin98Or2k = verMaj >= 4 && verMin >= 10;
1445                     break;
1446
1447                 case wxWINDOWS_NT:
1448                     s_isWin98Or2k = verMaj >= 5;
1449                     break;
1450
1451                 default:
1452                     // unknown, be conseravtive by default
1453                     s_isWin98Or2k = 0;
1454             }
1455
1456             wxASSERT_MSG( s_isWin98Or2k != -1, _T("should be set above") );
1457         }
1458
1459         return s_isWin98Or2k == 1;
1460     }
1461
1462     long m_CodePage;
1463 };
1464
1465 #endif // wxHAVE_WIN32_MB2WC
1466
1467 // ============================================================================
1468 // Cocoa conversion classes
1469 // ============================================================================
1470
1471 #if defined(__WXCOCOA__)
1472
// RN:  There is no UTF-32 support in either Core Foundation or
// Cocoa.  Strangely enough, Core Foundation uses UTF-32 internally
// quite a bit - it's just not public (yet).
1476
1477 #include <CoreFoundation/CFString.h>
1478 #include <CoreFoundation/CFStringEncodingExt.h>
1479
1480 CFStringEncoding wxCFStringEncFromFontEnc(wxFontEncoding encoding)
1481 {
1482     CFStringEncoding enc = 0 ;
1483     if ( encoding == wxFONTENCODING_DEFAULT )
1484     {
1485 #if wxUSE_GUI
1486         encoding = wxFont::GetDefaultEncoding() ;
1487 #else
1488         encoding = wxLocale::GetSystemEncoding() ;
1489 #endif
1490     }
1491     else switch( encoding)
1492     {
1493         case wxFONTENCODING_ISO8859_1 :
1494             enc = kCFStringEncodingISOLatin1 ;
1495             break ;
1496         case wxFONTENCODING_ISO8859_2 :
1497             enc = kCFStringEncodingISOLatin2;
1498             break ;
1499         case wxFONTENCODING_ISO8859_3 :
1500             enc = kCFStringEncodingISOLatin3 ;
1501             break ;
1502         case wxFONTENCODING_ISO8859_4 :
1503             enc = kCFStringEncodingISOLatin4;
1504             break ;
1505         case wxFONTENCODING_ISO8859_5 :
1506             enc = kCFStringEncodingISOLatinCyrillic;
1507             break ;
1508         case wxFONTENCODING_ISO8859_6 :
1509             enc = kCFStringEncodingISOLatinArabic;
1510             break ;
1511         case wxFONTENCODING_ISO8859_7 :
1512             enc = kCFStringEncodingISOLatinGreek;
1513             break ;
1514         case wxFONTENCODING_ISO8859_8 :
1515             enc = kCFStringEncodingISOLatinHebrew;
1516             break ;
1517         case wxFONTENCODING_ISO8859_9 :
1518             enc = kCFStringEncodingISOLatin5;
1519             break ;
1520         case wxFONTENCODING_ISO8859_10 :
1521             enc = kCFStringEncodingISOLatin6;
1522             break ;
1523         case wxFONTENCODING_ISO8859_11 :
1524             enc = kCFStringEncodingISOLatinThai;
1525             break ;
1526         case wxFONTENCODING_ISO8859_13 :
1527             enc = kCFStringEncodingISOLatin7;
1528             break ;
1529         case wxFONTENCODING_ISO8859_14 :
1530             enc = kCFStringEncodingISOLatin8;
1531             break ;
1532         case wxFONTENCODING_ISO8859_15 :
1533             enc = kCFStringEncodingISOLatin9;
1534             break ;
1535
1536         case wxFONTENCODING_KOI8 :
1537             enc = kCFStringEncodingKOI8_R;
1538             break ;
1539         case wxFONTENCODING_ALTERNATIVE : // MS-DOS CP866
1540             enc = kCFStringEncodingDOSRussian;
1541             break ;
1542
1543 //      case wxFONTENCODING_BULGARIAN :
1544 //          enc = ;
1545 //          break ;
1546
1547         case wxFONTENCODING_CP437 :
1548             enc =kCFStringEncodingDOSLatinUS ;
1549             break ;
1550         case wxFONTENCODING_CP850 :
1551             enc = kCFStringEncodingDOSLatin1;
1552             break ;
1553         case wxFONTENCODING_CP852 :
1554             enc = kCFStringEncodingDOSLatin2;
1555             break ;
1556         case wxFONTENCODING_CP855 :
1557             enc = kCFStringEncodingDOSCyrillic;
1558             break ;
1559         case wxFONTENCODING_CP866 :
1560             enc =kCFStringEncodingDOSRussian ;
1561             break ;
1562         case wxFONTENCODING_CP874 :
1563             enc = kCFStringEncodingDOSThai;
1564             break ;
1565         case wxFONTENCODING_CP932 :
1566             enc = kCFStringEncodingDOSJapanese;
1567             break ;
1568         case wxFONTENCODING_CP936 :
1569             enc =kCFStringEncodingDOSChineseSimplif ;
1570             break ;
1571         case wxFONTENCODING_CP949 :
1572             enc = kCFStringEncodingDOSKorean;
1573             break ;
1574         case wxFONTENCODING_CP950 :
1575             enc = kCFStringEncodingDOSChineseTrad;
1576             break ;
1577
1578         case wxFONTENCODING_CP1250 :
1579             enc = kCFStringEncodingWindowsLatin2;
1580             break ;
1581         case wxFONTENCODING_CP1251 :
1582             enc =kCFStringEncodingWindowsCyrillic ;
1583             break ;
1584         case wxFONTENCODING_CP1252 :
1585             enc =kCFStringEncodingWindowsLatin1 ;
1586             break ;
1587         case wxFONTENCODING_CP1253 :
1588             enc = kCFStringEncodingWindowsGreek;
1589             break ;
1590         case wxFONTENCODING_CP1254 :
1591             enc = kCFStringEncodingWindowsLatin5;
1592             break ;
1593         case wxFONTENCODING_CP1255 :
1594             enc =kCFStringEncodingWindowsHebrew ;
1595             break ;
1596         case wxFONTENCODING_CP1256 :
1597             enc =kCFStringEncodingWindowsArabic ;
1598             break ;
1599         case wxFONTENCODING_CP1257 :
1600             enc = kCFStringEncodingWindowsBalticRim;
1601             break ;
1602         case wxFONTENCODING_UTF7 :
1603             enc = kCFStringEncodingNonLossyASCII ;
1604             break ;
1605         case wxFONTENCODING_UTF8 :
1606             enc = kCFStringEncodingUTF8 ;
1607             break ;
1608         case wxFONTENCODING_EUC_JP :
1609             enc = kCFStringEncodingEUC_JP;
1610             break ;
1611         case wxFONTENCODING_UTF16 :
1612             enc = kCFStringEncodingUnicode ;
1613             break ;
1614         case wxFONTENCODING_MACROMAN :
1615             enc = kCFStringEncodingMacRoman ;
1616             break ;
1617         case wxFONTENCODING_MACJAPANESE :
1618             enc = kCFStringEncodingMacJapanese ;
1619             break ;
1620         case wxFONTENCODING_MACCHINESETRAD :
1621             enc = kCFStringEncodingMacChineseTrad ;
1622             break ;
1623         case wxFONTENCODING_MACKOREAN :
1624             enc = kCFStringEncodingMacKorean ;
1625             break ;
1626         case wxFONTENCODING_MACARABIC :
1627             enc = kCFStringEncodingMacArabic ;
1628             break ;
1629         case wxFONTENCODING_MACHEBREW :
1630             enc = kCFStringEncodingMacHebrew ;
1631             break ;
1632         case wxFONTENCODING_MACGREEK :
1633             enc = kCFStringEncodingMacGreek ;
1634             break ;
1635         case wxFONTENCODING_MACCYRILLIC :
1636             enc = kCFStringEncodingMacCyrillic ;
1637             break ;
1638         case wxFONTENCODING_MACDEVANAGARI :
1639             enc = kCFStringEncodingMacDevanagari ;
1640             break ;
1641         case wxFONTENCODING_MACGURMUKHI :
1642             enc = kCFStringEncodingMacGurmukhi ;
1643             break ;
1644         case wxFONTENCODING_MACGUJARATI :
1645             enc = kCFStringEncodingMacGujarati ;
1646             break ;
1647         case wxFONTENCODING_MACORIYA :
1648             enc = kCFStringEncodingMacOriya ;
1649             break ;
1650         case wxFONTENCODING_MACBENGALI :
1651             enc = kCFStringEncodingMacBengali ;
1652             break ;
1653         case wxFONTENCODING_MACTAMIL :
1654             enc = kCFStringEncodingMacTamil ;
1655             break ;
1656         case wxFONTENCODING_MACTELUGU :
1657             enc = kCFStringEncodingMacTelugu ;
1658             break ;
1659         case wxFONTENCODING_MACKANNADA :
1660             enc = kCFStringEncodingMacKannada ;
1661             break ;
1662         case wxFONTENCODING_MACMALAJALAM :
1663             enc = kCFStringEncodingMacMalayalam ;
1664             break ;
1665         case wxFONTENCODING_MACSINHALESE :
1666             enc = kCFStringEncodingMacSinhalese ;
1667             break ;
1668         case wxFONTENCODING_MACBURMESE :
1669             enc = kCFStringEncodingMacBurmese ;
1670             break ;
1671         case wxFONTENCODING_MACKHMER :
1672             enc = kCFStringEncodingMacKhmer ;
1673             break ;
1674         case wxFONTENCODING_MACTHAI :
1675             enc = kCFStringEncodingMacThai ;
1676             break ;
1677         case wxFONTENCODING_MACLAOTIAN :
1678             enc = kCFStringEncodingMacLaotian ;
1679             break ;
1680         case wxFONTENCODING_MACGEORGIAN :
1681             enc = kCFStringEncodingMacGeorgian ;
1682             break ;
1683         case wxFONTENCODING_MACARMENIAN :
1684             enc = kCFStringEncodingMacArmenian ;
1685             break ;
1686         case wxFONTENCODING_MACCHINESESIMP :
1687             enc = kCFStringEncodingMacChineseSimp ;
1688             break ;
1689         case wxFONTENCODING_MACTIBETAN :
1690             enc = kCFStringEncodingMacTibetan ;
1691             break ;
1692         case wxFONTENCODING_MACMONGOLIAN :
1693             enc = kCFStringEncodingMacMongolian ;
1694             break ;
1695         case wxFONTENCODING_MACETHIOPIC :
1696             enc = kCFStringEncodingMacEthiopic ;
1697             break ;
1698         case wxFONTENCODING_MACCENTRALEUR :
1699             enc = kCFStringEncodingMacCentralEurRoman ;
1700             break ;
1701         case wxFONTENCODING_MACVIATNAMESE :
1702             enc = kCFStringEncodingMacVietnamese ;
1703             break ;
1704         case wxFONTENCODING_MACARABICEXT :
1705             enc = kCFStringEncodingMacExtArabic ;
1706             break ;
1707         case wxFONTENCODING_MACSYMBOL :
1708             enc = kCFStringEncodingMacSymbol ;
1709             break ;
1710         case wxFONTENCODING_MACDINGBATS :
1711             enc = kCFStringEncodingMacDingbats ;
1712             break ;
1713         case wxFONTENCODING_MACTURKISH :
1714             enc = kCFStringEncodingMacTurkish ;
1715             break ;
1716         case wxFONTENCODING_MACCROATIAN :
1717             enc = kCFStringEncodingMacCroatian ;
1718             break ;
1719         case wxFONTENCODING_MACICELANDIC :
1720             enc = kCFStringEncodingMacIcelandic ;
1721             break ;
1722         case wxFONTENCODING_MACROMANIAN :
1723             enc = kCFStringEncodingMacRomanian ;
1724             break ;
1725         case wxFONTENCODING_MACCELTIC :
1726             enc = kCFStringEncodingMacCeltic ;
1727             break ;
1728         case wxFONTENCODING_MACGAELIC :
1729             enc = kCFStringEncodingMacGaelic ;
1730             break ;
1731 //      case wxFONTENCODING_MACKEYBOARD :
1732 //          enc = kCFStringEncodingMacKeyboardGlyphs ;
1733 //          break ;
1734         default :
1735             // because gcc is picky
1736             break ;
1737     } ;
1738     return enc ;
1739 }
1740
1741 wxFontEncoding wxFontEncFromCFStringEnc(CFStringEncoding encoding)
1742 {
1743     wxFontEncoding enc = wxFONTENCODING_DEFAULT ;
1744
1745     switch( encoding)
1746     {
1747         case kCFStringEncodingISOLatin1  :
1748             enc = wxFONTENCODING_ISO8859_1 ;
1749             break ;
1750         case kCFStringEncodingISOLatin2 :
1751             enc = wxFONTENCODING_ISO8859_2;
1752             break ;
1753         case kCFStringEncodingISOLatin3 :
1754             enc = wxFONTENCODING_ISO8859_3 ;
1755             break ;
1756         case kCFStringEncodingISOLatin4 :
1757             enc = wxFONTENCODING_ISO8859_4;
1758             break ;
1759         case kCFStringEncodingISOLatinCyrillic :
1760             enc = wxFONTENCODING_ISO8859_5;
1761             break ;
1762         case kCFStringEncodingISOLatinArabic :
1763             enc = wxFONTENCODING_ISO8859_6;
1764             break ;
1765         case kCFStringEncodingISOLatinGreek :
1766             enc = wxFONTENCODING_ISO8859_7;
1767             break ;
1768         case kCFStringEncodingISOLatinHebrew :
1769             enc = wxFONTENCODING_ISO8859_8;
1770             break ;
1771         case kCFStringEncodingISOLatin5 :
1772             enc = wxFONTENCODING_ISO8859_9;
1773             break ;
1774         case kCFStringEncodingISOLatin6 :
1775             enc = wxFONTENCODING_ISO8859_10;
1776             break ;
1777         case kCFStringEncodingISOLatin7 :
1778             enc = wxFONTENCODING_ISO8859_13;
1779             break ;
1780         case kCFStringEncodingISOLatin8 :
1781             enc = wxFONTENCODING_ISO8859_14;
1782             break ;
1783         case kCFStringEncodingISOLatin9 :
1784             enc =wxFONTENCODING_ISO8859_15 ;
1785             break ;
1786
1787         case kCFStringEncodingKOI8_R :
1788             enc = wxFONTENCODING_KOI8;
1789             break ;
1790
1791 //      case  :
1792 //          enc = wxFONTENCODING_BULGARIAN;
1793 //          break ;
1794
1795         case kCFStringEncodingDOSLatinUS :
1796             enc = wxFONTENCODING_CP437;
1797             break ;
1798         case kCFStringEncodingDOSLatin1 :
1799             enc = wxFONTENCODING_CP850;
1800             break ;
1801         case kCFStringEncodingDOSLatin2 :
1802             enc =wxFONTENCODING_CP852 ;
1803             break ;
1804         case kCFStringEncodingDOSCyrillic :
1805             enc = wxFONTENCODING_CP855;
1806             break ;
1807         case kCFStringEncodingDOSRussian :
1808             enc = wxFONTENCODING_CP866;
1809             break ;
1810         case kCFStringEncodingDOSThai :
1811             enc =wxFONTENCODING_CP874 ;
1812             break ;
1813         case kCFStringEncodingDOSJapanese :
1814             enc = wxFONTENCODING_CP932;
1815             break ;
1816         case kCFStringEncodingDOSChineseSimplif :
1817             enc = wxFONTENCODING_CP936;
1818             break ;
1819         case kCFStringEncodingDOSKorean :
1820             enc = wxFONTENCODING_CP949;
1821             break ;
1822         case kCFStringEncodingDOSChineseTrad :
1823             enc = wxFONTENCODING_CP950;
1824             break ;
1825
1826         case kCFStringEncodingWindowsLatin2 :
1827             enc = wxFONTENCODING_CP1250;
1828             break ;
1829         case kCFStringEncodingWindowsCyrillic :
1830             enc = wxFONTENCODING_CP1251;
1831             break ;
1832         case kCFStringEncodingWindowsLatin1 :
1833             enc = wxFONTENCODING_CP1252;
1834             break ;
1835         case kCFStringEncodingWindowsGreek :
1836             enc = wxFONTENCODING_CP1253;
1837             break ;
1838         case kCFStringEncodingWindowsLatin5 :
1839             enc = wxFONTENCODING_CP1254;
1840             break ;
1841         case kCFStringEncodingWindowsHebrew :
1842             enc = wxFONTENCODING_CP1255;
1843             break ;
1844         case kCFStringEncodingWindowsArabic :
1845             enc = wxFONTENCODING_CP1256;
1846             break ;
1847         case kCFStringEncodingWindowsBalticRim :
1848             enc =wxFONTENCODING_CP1257 ;
1849             break ;
1850         case kCFStringEncodingEUC_JP :
1851             enc = wxFONTENCODING_EUC_JP;
1852             break ;
1853         case kCFStringEncodingUnicode :
1854             enc = wxFONTENCODING_UTF16;
1855             break;
1856         case kCFStringEncodingMacRoman :
1857             enc = wxFONTENCODING_MACROMAN ;
1858             break ;
1859         case kCFStringEncodingMacJapanese :
1860             enc = wxFONTENCODING_MACJAPANESE ;
1861             break ;
1862         case kCFStringEncodingMacChineseTrad :
1863             enc = wxFONTENCODING_MACCHINESETRAD ;
1864             break ;
1865         case kCFStringEncodingMacKorean :
1866             enc = wxFONTENCODING_MACKOREAN ;
1867             break ;
1868         case kCFStringEncodingMacArabic :
1869             enc =wxFONTENCODING_MACARABIC ;
1870             break ;
1871         case kCFStringEncodingMacHebrew :
1872             enc = wxFONTENCODING_MACHEBREW ;
1873             break ;
1874         case kCFStringEncodingMacGreek :
1875             enc = wxFONTENCODING_MACGREEK ;
1876             break ;
1877         case kCFStringEncodingMacCyrillic :
1878             enc = wxFONTENCODING_MACCYRILLIC ;
1879             break ;
1880         case kCFStringEncodingMacDevanagari :
1881             enc = wxFONTENCODING_MACDEVANAGARI ;
1882             break ;
1883         case kCFStringEncodingMacGurmukhi :
1884             enc = wxFONTENCODING_MACGURMUKHI ;
1885             break ;
1886         case kCFStringEncodingMacGujarati :
1887             enc = wxFONTENCODING_MACGUJARATI ;
1888             break ;
1889         case kCFStringEncodingMacOriya :
1890             enc =wxFONTENCODING_MACORIYA ;
1891             break ;
1892         case kCFStringEncodingMacBengali :
1893             enc =wxFONTENCODING_MACBENGALI ;
1894             break ;
1895         case kCFStringEncodingMacTamil :
1896             enc = wxFONTENCODING_MACTAMIL ;
1897             break ;
1898         case kCFStringEncodingMacTelugu :
1899             enc = wxFONTENCODING_MACTELUGU ;
1900             break ;
1901         case kCFStringEncodingMacKannada :
1902             enc = wxFONTENCODING_MACKANNADA ;
1903             break ;
1904         case kCFStringEncodingMacMalayalam :
1905             enc = wxFONTENCODING_MACMALAJALAM ;
1906             break ;
1907         case kCFStringEncodingMacSinhalese :
1908             enc = wxFONTENCODING_MACSINHALESE ;
1909             break ;
1910         case kCFStringEncodingMacBurmese :
1911             enc = wxFONTENCODING_MACBURMESE ;
1912             break ;
1913         case kCFStringEncodingMacKhmer :
1914             enc = wxFONTENCODING_MACKHMER ;
1915             break ;
1916         case kCFStringEncodingMacThai :
1917             enc = wxFONTENCODING_MACTHAI ;
1918             break ;
1919         case kCFStringEncodingMacLaotian :
1920             enc = wxFONTENCODING_MACLAOTIAN ;
1921             break ;
1922         case kCFStringEncodingMacGeorgian :
1923             enc = wxFONTENCODING_MACGEORGIAN ;
1924             break ;
1925         case kCFStringEncodingMacArmenian :
1926             enc = wxFONTENCODING_MACARMENIAN ;
1927             break ;
1928         case kCFStringEncodingMacChineseSimp :
1929             enc = wxFONTENCODING_MACCHINESESIMP ;
1930             break ;
1931         case kCFStringEncodingMacTibetan :
1932             enc = wxFONTENCODING_MACTIBETAN ;
1933             break ;
1934         case kCFStringEncodingMacMongolian :
1935             enc = wxFONTENCODING_MACMONGOLIAN ;
1936             break ;
1937         case kCFStringEncodingMacEthiopic :
1938             enc = wxFONTENCODING_MACETHIOPIC ;
1939             break ;
1940         case kCFStringEncodingMacCentralEurRoman:
1941             enc = wxFONTENCODING_MACCENTRALEUR  ;
1942             break ;
1943         case kCFStringEncodingMacVietnamese:
1944             enc = wxFONTENCODING_MACVIATNAMESE  ;
1945             break ;
1946         case kCFStringEncodingMacExtArabic :
1947             enc = wxFONTENCODING_MACARABICEXT ;
1948             break ;
1949         case kCFStringEncodingMacSymbol :
1950             enc = wxFONTENCODING_MACSYMBOL ;
1951             break ;
1952         case kCFStringEncodingMacDingbats :
1953             enc = wxFONTENCODING_MACDINGBATS ;
1954             break ;
1955         case kCFStringEncodingMacTurkish :
1956             enc = wxFONTENCODING_MACTURKISH ;
1957             break ;
1958         case kCFStringEncodingMacCroatian :
1959             enc = wxFONTENCODING_MACCROATIAN ;
1960             break ;
1961         case kCFStringEncodingMacIcelandic :
1962             enc = wxFONTENCODING_MACICELANDIC ;
1963             break ;
1964         case kCFStringEncodingMacRomanian :
1965             enc = wxFONTENCODING_MACROMANIAN ;
1966             break ;
1967         case kCFStringEncodingMacCeltic :
1968             enc = wxFONTENCODING_MACCELTIC ;
1969             break ;
1970         case kCFStringEncodingMacGaelic :
1971             enc = wxFONTENCODING_MACGAELIC ;
1972             break ;
1973 //        case kCFStringEncodingMacKeyboardGlyphs :
1974 //            enc = wxFONTENCODING_MACKEYBOARD ;
1975 //            break ;
1976     } ;
1977     return enc ;
1978 }
1979
1980 class wxMBConv_cocoa : public wxMBConv
1981 {
1982 public:
1983     wxMBConv_cocoa()
1984     {
1985         Init(CFStringGetSystemEncoding()) ;
1986     }
1987
1988     wxMBConv_cocoa(const wxChar* name)
1989     {
1990         Init( wxCFStringEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
1991     }
1992
1993     wxMBConv_cocoa(wxFontEncoding encoding)
1994     {
1995         Init( wxCFStringEncFromFontEnc(encoding) );
1996     }
1997
1998     ~wxMBConv_cocoa()
1999     {
2000     }
2001
2002     void Init( CFStringEncoding encoding)
2003     {
2004         m_char_encoding = encoding ;
2005         m_unicode_encoding = kCFStringEncodingUnicode;
2006     }
2007
2008     size_t MB2WC(wchar_t * szOut, const char * szUnConv, size_t nOutSize) const
2009     {
2010         wxASSERT(szUnConv);
2011
2012         size_t nBufSize = strlen(szUnConv) + 1;
2013         size_t nRealOutSize;
2014
2015         UniChar* szUniCharBuffer    = (UniChar*) szOut;
2016         wchar_t* szConvBuffer       = szOut;
2017
2018         if (szConvBuffer == NULL && nOutSize != 0)
2019         {
2020             szConvBuffer = new wchar_t[nOutSize] ;
2021         }
2022
2023 #if SIZEOF_WCHAR_T == 4
2024         szUniCharBuffer = new UniChar[nOutSize];
2025 #endif
2026
2027         CFDataRef theData = CFDataCreateWithBytesNoCopy (
2028                                             NULL,     //allocator
2029                                             (const UInt8*)szUnConv,
2030                                             nBufSize - 1,
2031                                             NULL      //deallocator
2032                                             );
2033
2034         wxASSERT(theData);
2035
2036         CFStringRef theString = CFStringCreateFromExternalRepresentation (
2037                                                 NULL,
2038                                                 theData,
2039                                                 m_char_encoding
2040                                                 );
2041
2042         wxASSERT(theString);
2043
2044         if (nOutSize == 0)
2045         {
2046             nRealOutSize = CFStringGetLength(theString) + 1;
2047             CFRelease(theString);
2048             return nRealOutSize - 1;
2049         }
2050
2051         CFRange theRange = { 0, CFStringGetLength(theString) };
2052
2053         CFStringGetCharacters(theString, theRange, szUniCharBuffer);
2054
2055
2056         nRealOutSize = (CFStringGetLength(theString) + 1);
2057
2058         CFRelease(theString);
2059
2060         szUniCharBuffer[nRealOutSize-1] = '\0' ;
2061
2062 #if SIZEOF_WCHAR_T == 4
2063         wxMBConvUTF16 converter ;
2064         converter.MB2WC(szConvBuffer  , (const char*)szUniCharBuffer , nRealOutSize ) ;
2065         delete[] szUniCharBuffer;
2066 #endif
2067         if ( szOut == NULL )
2068             delete [] szConvBuffer;
2069
2070         return nRealOutSize ;
2071     }
2072
2073     size_t WC2MB(char *szOut, const wchar_t *szUnConv, size_t nOutSize) const
2074     {
2075         size_t nBufSize = wxWcslen(szUnConv) + 1;
2076         size_t nRealOutSize;
2077         char* szBuffer = szOut;
2078         UniChar* szUniBuffer = (UniChar*) szUnConv;
2079
2080         if (szOut == NULL)
2081         {
2082             // worst case
2083             nRealOutSize = wxString::WorstEncodingCase(nBufSize - 1, *this)+1 ;
2084             szBuffer = new char[ nRealOutSize ] ;
2085         }
2086         else
2087             nRealOutSize = nOutSize;
2088
2089 #if SIZEOF_WCHAR_T == 4
2090         wxMBConvUTF16BE converter ;
2091         nBufSize = converter.WC2MB( NULL , szUnConv , 0 );
2092         szUniBuffer = new UniChar[ (nBufSize / sizeof(UniChar)) + 1] ;
2093         converter.WC2MB( (char*) szUniBuffer , szUnConv, nBufSize + sizeof(UniChar)) ;
2094         nBufSize /= sizeof(UniChar);
2095         ++nBufSize;
2096 #endif
2097
2098         CFStringRef theString = CFStringCreateWithCharactersNoCopy(
2099                                 NULL, //allocator
2100                                 szUniBuffer,
2101                                 nBufSize,
2102                                 NULL //deallocator
2103                             );
2104
2105         wxASSERT(theString);
2106
2107         //Note that CER puts a BOM when converting to unicode
2108         //so we may want to check and use getchars instead in that case
2109         CFDataRef theData = CFStringCreateExternalRepresentation(
2110                                 NULL, //allocator
2111                                 theString,
2112                                 m_char_encoding,
2113                                 0 //what to put in characters that can't be converted -
2114                                     //0 tells CFString to return NULL if it meets such a character
2115                         );
2116
2117         if(!theData)
2118             return (size_t)-1;
2119
2120         CFRelease(theString);
2121
2122         nRealOutSize = CFDataGetLength(theData);
2123
2124         if ( szOut == NULL )
2125             delete[] szBuffer;
2126
2127         if(nOutSize == 0)
2128         {
2129 //TODO: This gets flagged as a non-malloced address by the debugger...
2130 //#if SIZEOF_WCHAR_T == 4
2131 //        delete[] szUniBuffer;
2132 //#endif
2133             CFRelease(theData);
2134             return nRealOutSize - 1;
2135         }
2136
2137         CFRange theRange = {0, CFDataGetLength(theData) };
2138         CFDataGetBytes(theData, theRange, (UInt8*) szBuffer);
2139
2140         CFRelease(theData);
2141
2142 //TODO: This gets flagged as a non-malloced address by the debugger...
2143 //#if SIZEOF_WCHAR_T == 4
2144 //        delete[] szUniBuffer;
2145 //#endif
2146         return  nRealOutSize - 1;
2147     }
2148
2149     bool IsOk() const
2150     {
2151         //TODO: check for invalid en/de/coding
2152         return true;
2153     }
2154
2155 private:
2156     CFStringEncoding m_char_encoding ;
2157     CFStringEncoding m_unicode_encoding ;
2158 };
2159
2160 #endif // defined(__WXCOCOA__)
2161
2162 // ============================================================================
2163 // Mac conversion classes
2164 // ============================================================================
2165
2166 #if defined(__WXMAC__) && defined(TARGET_CARBON)
2167
2168 class wxMBConv_mac : public wxMBConv
2169 {
2170 public:
2171     wxMBConv_mac()
2172     {
2173         Init(CFStringGetSystemEncoding()) ;
2174     }
2175
2176     wxMBConv_mac(const wxChar* name)
2177     {
2178         Init( wxMacGetSystemEncFromFontEnc(wxFontMapper::Get()->CharsetToEncoding(name, false) ) ) ;
2179     }
2180
2181     wxMBConv_mac(wxFontEncoding encoding)
2182     {
2183         Init( wxMacGetSystemEncFromFontEnc(encoding) );
2184     }
2185
2186     ~wxMBConv_mac()
2187     {
2188         OSStatus status = noErr ;
2189         status = TECDisposeConverter(m_MB2WC_converter);
2190         status = TECDisposeConverter(m_WC2MB_converter);
2191     }
2192
2193
2194     void Init( TextEncodingBase encoding)
2195     {
2196         OSStatus status = noErr ;
2197         m_char_encoding = encoding ;
2198         m_unicode_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,0,kUnicode16BitFormat) ;
2199
2200         status = TECCreateConverter(&m_MB2WC_converter,
2201                                     m_char_encoding,
2202                                     m_unicode_encoding);
2203         status = TECCreateConverter(&m_WC2MB_converter,
2204                                     m_unicode_encoding,
2205                                     m_char_encoding);
2206     }
2207
2208     size_t MB2WC(wchar_t *buf, const char *psz, size_t n) const
2209     {
2210         OSStatus status = noErr ;
2211         ByteCount byteOutLen ;
2212         ByteCount byteInLen = strlen(psz) ;
2213         wchar_t *tbuf = NULL ;
2214         UniChar* ubuf = NULL ;
2215         size_t res = 0 ;
2216
2217         if (buf == NULL)
2218         {
2219             n = byteInLen ;
2220             tbuf = (wchar_t*) malloc( n * SIZEOF_WCHAR_T) ;
2221         }
2222         ByteCount byteBufferLen = n * sizeof( UniChar ) ;
2223 #if SIZEOF_WCHAR_T == 4
2224         ubuf = (UniChar*) malloc( byteBufferLen + 2 ) ;
2225 #else
2226         ubuf = (UniChar*) (buf ? buf : tbuf) ;
2227 #endif
2228         status = TECConvertText(m_MB2WC_converter, (ConstTextPtr) psz , byteInLen, &byteInLen,
2229           (TextPtr) ubuf , byteBufferLen, &byteOutLen);
2230 #if SIZEOF_WCHAR_T == 4
2231         // we have to terminate here, because n might be larger for the trailing zero, and if UniChar
2232         // is not properly terminated we get random characters at the end
2233         ubuf[byteOutLen / sizeof( UniChar ) ] = 0 ;
2234         wxMBConvUTF16BE converter ;
2235         res = converter.MB2WC( (buf ? buf : tbuf) , (const char*)ubuf , n ) ;
2236         free( ubuf ) ;
2237 #else
2238         res = byteOutLen / sizeof( UniChar ) ;
2239 #endif
2240         if ( buf == NULL )
2241              free(tbuf) ;
2242
2243         if ( buf  && res < n)
2244             buf[res] = 0;
2245
2246         return res ;
2247     }
2248
2249     size_t WC2MB(char *buf, const wchar_t *psz, size_t n) const
2250     {
2251         OSStatus status = noErr ;
2252         ByteCount byteOutLen ;
2253         ByteCount byteInLen = wxWcslen(psz) * SIZEOF_WCHAR_T ;
2254
2255         char *tbuf = NULL ;
2256
2257         if (buf == NULL)
2258         {
2259             // worst case
2260             n = wxString::WorstEncodingCase(byteInLen / SIZEOF_WCHAR_T, *this) + SIZEOF_WCHAR_T;
2261             tbuf = (char*) malloc( n ) ;
2262         }
2263
2264         ByteCount byteBufferLen = n ;
2265         UniChar* ubuf = NULL ;
2266 #if SIZEOF_WCHAR_T == 4
2267         wxMBConvUTF16BE converter ;
2268         size_t unicharlen = converter.WC2MB( NULL , psz , 0 ) ;
2269         byteInLen = unicharlen ;
2270         ubuf = (UniChar*) malloc( byteInLen + 2 ) ;
2271         converter.WC2MB( (char*) ubuf , psz, unicharlen + 2 ) ;
2272 #else
2273         ubuf = (UniChar*) psz ;
2274 #endif
2275         status = TECConvertText(m_WC2MB_converter, (ConstTextPtr) ubuf , byteInLen, &byteInLen,
2276             (TextPtr) (buf ? buf : tbuf) , byteBufferLen, &byteOutLen);
2277 #if SIZEOF_WCHAR_T == 4
2278         free( ubuf ) ;
2279 #endif
2280         if ( buf == NULL )
2281             free(tbuf) ;
2282
2283         size_t res = byteOutLen ;
2284         if ( buf  && res < n)
2285             buf[res] = 0;
2286
2287         return res ;
2288     }
2289
2290     bool IsOk() const
2291         { return m_MB2WC_converter !=  NULL && m_WC2MB_converter != NULL  ; }
2292
2293 private:
2294     TECObjectRef m_MB2WC_converter ;
2295     TECObjectRef m_WC2MB_converter ;
2296
2297     TextEncodingBase m_char_encoding ;
2298     TextEncodingBase m_unicode_encoding ;
2299 };
2300
2301 #endif // defined(__WXMAC__) && defined(TARGET_CARBON)
2302
2303 // ============================================================================
2304 // wxEncodingConverter based conversion classes
2305 // ============================================================================
2306
2307 #if wxUSE_FONTMAP
2308
2309 class wxMBConv_wxwin : public wxMBConv
2310 {
2311 private:
2312     void Init()
2313     {
2314         m_ok = m2w.Init(m_enc, wxFONTENCODING_UNICODE) &&
2315                w2m.Init(wxFONTENCODING_UNICODE, m_enc);
2316     }
2317
2318 public:
2319     // temporarily just use wxEncodingConverter stuff,
2320     // so that it works while a better implementation is built
2321     wxMBConv_wxwin(const wxChar* name)
2322     {
2323         if (name)
2324             m_enc = wxFontMapper::Get()->CharsetToEncoding(name, false);
2325         else
2326             m_enc = wxFONTENCODING_SYSTEM;
2327
2328         Init();
2329     }
2330
2331     wxMBConv_wxwin(wxFontEncoding enc)
2332     {
2333         m_enc = enc;
2334
2335         Init();
2336     }
2337
2338     size_t MB2WC(wchar_t *buf, const char *psz, size_t WXUNUSED(n)) const
2339     {
2340         size_t inbuf = strlen(psz);
2341         if (buf)
2342             m2w.Convert(psz,buf);
2343         return inbuf;
2344     }
2345
2346     size_t WC2MB(char *buf, const wchar_t *psz, size_t WXUNUSED(n)) const
2347     {
2348         const size_t inbuf = wxWcslen(psz);
2349         if (buf)
2350             w2m.Convert(psz,buf);
2351
2352         return inbuf;
2353     }
2354
2355     bool IsOk() const { return m_ok; }
2356
2357 public:
2358     wxFontEncoding m_enc;
2359     wxEncodingConverter m2w, w2m;
2360
2361     // were we initialized successfully?
2362     bool m_ok;
2363
2364     DECLARE_NO_COPY_CLASS(wxMBConv_wxwin)
2365 };
2366
2367 #endif // wxUSE_FONTMAP
2368
2369 // ============================================================================
2370 // wxCSConv implementation
2371 // ============================================================================
2372
2373 void wxCSConv::Init()
2374 {
2375     m_name = NULL;
2376     m_convReal =  NULL;
2377     m_deferred = true;
2378 }
2379
2380 wxCSConv::wxCSConv(const wxChar *charset)
2381 {
2382     Init();
2383
2384     if ( charset )
2385     {
2386         SetName(charset);
2387     }
2388
2389     m_encoding = wxFONTENCODING_SYSTEM;
2390 }
2391
2392 wxCSConv::wxCSConv(wxFontEncoding encoding)
2393 {
2394     if ( encoding == wxFONTENCODING_MAX || encoding == wxFONTENCODING_DEFAULT )
2395     {
2396         wxFAIL_MSG( _T("invalid encoding value in wxCSConv ctor") );
2397
2398         encoding = wxFONTENCODING_SYSTEM;
2399     }
2400
2401     Init();
2402
2403     m_encoding = encoding;
2404 }
2405
2406 wxCSConv::~wxCSConv()
2407 {
2408     Clear();
2409 }
2410
2411 wxCSConv::wxCSConv(const wxCSConv& conv)
2412         : wxMBConv()
2413 {
2414     Init();
2415
2416     SetName(conv.m_name);
2417     m_encoding = conv.m_encoding;
2418 }
2419
2420 wxCSConv& wxCSConv::operator=(const wxCSConv& conv)
2421 {
2422     Clear();
2423
2424     SetName(conv.m_name);
2425     m_encoding = conv.m_encoding;
2426
2427     return *this;
2428 }
2429
2430 void wxCSConv::Clear()
2431 {
2432     free(m_name);
2433     delete m_convReal;
2434
2435     m_name = NULL;
2436     m_convReal = NULL;
2437 }
2438
2439 void wxCSConv::SetName(const wxChar *charset)
2440 {
2441     if (charset)
2442     {
2443         m_name = wxStrdup(charset);
2444         m_deferred = true;
2445     }
2446 }
2447
2448 wxMBConv *wxCSConv::DoCreate() const
2449 {
2450     // check for the special case of ASCII or ISO8859-1 charset: as we have
2451     // special knowledge of it anyhow, we don't need to create a special
2452     // conversion object
2453     if ( m_encoding == wxFONTENCODING_ISO8859_1 )
2454     {
2455         // don't convert at all
2456         return NULL;
2457     }
2458
2459     // we trust OS to do conversion better than we can so try external
2460     // conversion methods first
2461     //
2462     // the full order is:
2463     //      1. OS conversion (iconv() under Unix or Win32 API)
2464     //      2. hard coded conversions for UTF
2465     //      3. wxEncodingConverter as fall back
2466
2467     // step (1)
2468 #ifdef HAVE_ICONV
2469 #if !wxUSE_FONTMAP
2470     if ( m_name )
2471 #endif // !wxUSE_FONTMAP
2472     {
2473         wxString name(m_name);
2474
2475 #if wxUSE_FONTMAP
2476         if ( name.empty() )
2477             name = wxFontMapper::Get()->GetEncodingName(m_encoding);
2478 #endif // wxUSE_FONTMAP
2479
2480         wxMBConv_iconv *conv = new wxMBConv_iconv(name);
2481         if ( conv->IsOk() )
2482             return conv;
2483
2484         delete conv;
2485     }
2486 #endif // HAVE_ICONV
2487
2488 #ifdef wxHAVE_WIN32_MB2WC
2489     {
2490 #if wxUSE_FONTMAP
2491         wxMBConv_win32 *conv = m_name ? new wxMBConv_win32(m_name)
2492                                       : new wxMBConv_win32(m_encoding);
2493         if ( conv->IsOk() )
2494             return conv;
2495
2496         delete conv;
2497 #else
2498         return NULL;
2499 #endif
2500     }
2501 #endif // wxHAVE_WIN32_MB2WC
2502 #if defined(__WXMAC__)
2503     {
2504         if ( m_name || ( m_encoding < wxFONTENCODING_UTF16BE ) )
2505         {
2506
2507             wxMBConv_mac *conv = m_name ? new wxMBConv_mac(m_name)
2508                                         : new wxMBConv_mac(m_encoding);
2509             if ( conv->IsOk() )
2510                  return conv;
2511
2512             delete conv;
2513         }
2514     }
2515 #endif
2516 #if defined(__WXCOCOA__)
2517     {
2518         if ( m_name || ( m_encoding <= wxFONTENCODING_UTF16 ) )
2519         {
2520
2521             wxMBConv_cocoa *conv = m_name ? new wxMBConv_cocoa(m_name)
2522                                           : new wxMBConv_cocoa(m_encoding);
2523             if ( conv->IsOk() )
2524                  return conv;
2525
2526             delete conv;
2527         }
2528     }
2529 #endif
2530     // step (2)
2531     wxFontEncoding enc = m_encoding;
2532 #if wxUSE_FONTMAP
2533     if ( enc == wxFONTENCODING_SYSTEM && m_name )
2534     {
2535         // use "false" to suppress interactive dialogs -- we can be called from
2536         // anywhere and popping up a dialog from here is the last thing we want to
2537         // do
2538         enc = wxFontMapper::Get()->CharsetToEncoding(m_name, false);
2539     }
2540 #endif // wxUSE_FONTMAP
2541
2542     switch ( enc )
2543     {
2544         case wxFONTENCODING_UTF7:
2545              return new wxMBConvUTF7;
2546
2547         case wxFONTENCODING_UTF8:
2548              return new wxMBConvUTF8;
2549
2550         case wxFONTENCODING_UTF16BE:
2551              return new wxMBConvUTF16BE;
2552
2553         case wxFONTENCODING_UTF16LE:
2554              return new wxMBConvUTF16LE;
2555
2556         case wxFONTENCODING_UTF32BE:
2557              return new wxMBConvUTF32BE;
2558
2559         case wxFONTENCODING_UTF32LE:
2560              return new wxMBConvUTF32LE;
2561
2562         default:
2563              // nothing to do but put here to suppress gcc warnings
2564              ;
2565     }
2566
2567     // step (3)
2568 #if wxUSE_FONTMAP
2569     {
2570         wxMBConv_wxwin *conv = m_name ? new wxMBConv_wxwin(m_name)
2571                                       : new wxMBConv_wxwin(m_encoding);
2572         if ( conv->IsOk() )
2573             return conv;
2574
2575         delete conv;
2576     }
2577 #endif // wxUSE_FONTMAP
2578
2579     // NB: This is a hack to prevent deadlock. What could otherwise happen
2580     //     in Unicode build: wxConvLocal creation ends up being here
2581     //     because of some failure and logs the error. But wxLog will try to
2582     //     attach timestamp, for which it will need wxConvLocal (to convert
2583     //     time to char* and then wchar_t*), but that fails, tries to log
2584     //     error, but wxLog has a (already locked) critical section that
2585     //     guards static buffer.
2586     static bool alreadyLoggingError = false;
2587     if (!alreadyLoggingError)
2588     {
2589         alreadyLoggingError = true;
2590         wxLogError(_("Cannot convert from the charset '%s'!"),
2591                    m_name ? m_name
2592                       :
2593 #if wxUSE_FONTMAP
2594                          wxFontMapper::GetEncodingDescription(m_encoding).c_str()
2595 #else // !wxUSE_FONTMAP
2596                          wxString::Format(_("encoding %s"), m_encoding).c_str()
2597 #endif // wxUSE_FONTMAP/!wxUSE_FONTMAP
2598               );
2599         alreadyLoggingError = false;
2600     }
2601
2602     return NULL;
2603 }
2604
2605 void wxCSConv::CreateConvIfNeeded() const
2606 {
2607     if ( m_deferred )
2608     {
2609         wxCSConv *self = (wxCSConv *)this; // const_cast
2610
2611 #if wxUSE_INTL
2612         // if we don't have neither the name nor the encoding, use the default
2613         // encoding for this system
2614         if ( !m_name && m_encoding == wxFONTENCODING_SYSTEM )
2615         {
2616             self->m_name = wxStrdup(wxLocale::GetSystemEncodingName());
2617         }
2618 #endif // wxUSE_INTL
2619
2620         self->m_convReal = DoCreate();
2621         self->m_deferred = false;
2622     }
2623 }
2624
2625 size_t wxCSConv::MB2WC(wchar_t *buf, const char *psz, size_t n) const
2626 {
2627     CreateConvIfNeeded();
2628
2629     if (m_convReal)
2630         return m_convReal->MB2WC(buf, psz, n);
2631
2632     // latin-1 (direct)
2633     size_t len = strlen(psz);
2634
2635     if (buf)
2636     {
2637         for (size_t c = 0; c <= len; c++)
2638             buf[c] = (unsigned char)(psz[c]);
2639     }
2640
2641     return len;
2642 }
2643
2644 size_t wxCSConv::WC2MB(char *buf, const wchar_t *psz, size_t n) const
2645 {
2646     CreateConvIfNeeded();
2647
2648     if (m_convReal)
2649         return m_convReal->WC2MB(buf, psz, n);
2650
2651     // latin-1 (direct)
2652     const size_t len = wxWcslen(psz);
2653     if (buf)
2654     {
2655         for (size_t c = 0; c <= len; c++)
2656         {
2657             if (psz[c] > 0xFF)
2658                 return (size_t)-1;
2659             buf[c] = (char)psz[c];
2660         }
2661     }
2662     else
2663     {
2664         for (size_t c = 0; c <= len; c++)
2665         {
2666             if (psz[c] > 0xFF)
2667                 return (size_t)-1;
2668         }
2669     }
2670
2671     return len;
2672 }
2673
2674 // ----------------------------------------------------------------------------
2675 // globals
2676 // ----------------------------------------------------------------------------
2677
// The object behind wxConvLibc: its class depends on the platform, falling
// back to wxMBConvLibc when neither of the first two conditions holds.
#ifdef __WINDOWS__
    static wxMBConv_win32 wxConvLibcObj;
#elif defined(__WXMAC__) && !defined(__MACH__)
    static wxMBConv_mac wxConvLibcObj ;
#else
    static wxMBConvLibc wxConvLibcObj;
#endif

// the objects behind the other predefined converters
static wxCSConv wxConvLocalObj(wxFONTENCODING_SYSTEM);
static wxCSConv wxConvISO8859_1Obj(wxFONTENCODING_ISO8859_1);
static wxMBConvUTF7 wxConvUTF7Obj;
static wxMBConvUTF8 wxConvUTF8Obj;


// The exported converters are references bound to the file-static objects
// above; wxConvCurrent is a pointer which initially points to the libc
// converter object.
WXDLLIMPEXP_DATA_BASE(wxMBConv&) wxConvLibc = wxConvLibcObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvLocal = wxConvLocalObj;
WXDLLIMPEXP_DATA_BASE(wxCSConv&) wxConvISO8859_1 = wxConvISO8859_1Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF7&) wxConvUTF7 = wxConvUTF7Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConvUTF8&) wxConvUTF8 = wxConvUTF8Obj;
WXDLLIMPEXP_DATA_BASE(wxMBConv *) wxConvCurrent = &wxConvLibcObj;
2698
2699 #else // !wxUSE_WCHAR_T
2700
// stand-ins in absence of wchar_t: the converters are plain wxMBConv objects
// here so that the global names exist in this configuration too
WXDLLIMPEXP_DATA_BASE(wxMBConv) wxConvLibc,
                                wxConvISO8859_1,
                                wxConvLocal,
                                wxConvUTF8;
2706
2707 #endif // wxUSE_WCHAR_T/!wxUSE_WCHAR_T
2708
2709